commit ee5ddba378f5173d26a1ffcae5b5ef7585790ebc
Author: ModelHub XC <noreply@modelhub.org.cn>
Date:   Tue May 12 12:06:33 2026 +0800

    初始化项目，由ModelHub XC社区提供模型
    
    Model: LLucass/TT_L0.2_H0.2_grpo
    Source: Original Platform

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..478a303
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,40 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-50/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-150/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..89f1efb
--- /dev/null
+++ b/README.md
@@ -0,0 +1,70 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+datasets: knoveleng/open-rs
+library_name: transformers
+model_name: TT_L0.2_H0.2_grpo
+tags:
+- generated_from_trainer
+- open-r1
+- trl
+- grpo
+licence: license
+---
+
+# Model Card for TT_L0.2_H0.2_grpo
+
+This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on the [knoveleng/open-rs](https://huggingface.co/datasets/knoveleng/open-rs) dataset.
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="LLucass/TT_L0.2_H0.2_grpo", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/lavatorywang-nus/uncertainty/runs/9gj0wo7b) 
+
+
+This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+
+### Framework versions
+
+- TRL: 0.16.0.dev0
+- Transformers: 4.51.3
+- PyTorch: 2.5.1
+- Datasets: 3.6.0
+- Tokenizers: 0.21.1
+
+## Citations
+
+Cite GRPO as:
+
+```bibtex
+@article{zhihong2024deepseekmath,
+    title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+    author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+    year         = 2024,
+    eprint       = {arXiv:2402.03300},
+}
+
+```
+
+Cite TRL as:
+    
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```
\ No newline at end of file
diff --git a/all_results.json b/all_results.json
new file mode 100644
index 0000000..2c27fe5
--- /dev/null
+++ b/all_results.json
@@ -0,0 +1,8 @@
+{
+    "total_flos": 0.0,
+    "train_loss": 3.2957177609205244e-09,
+    "train_runtime": 10011.2078,
+    "train_samples": 7000,
+    "train_samples_per_second": 1.279,
+    "train_steps_per_second": 0.02
+}
\ No newline at end of file
diff --git a/checkpoint-100/config.json b/checkpoint-100/config.json
new file mode 100644
index 0000000..78fed5b
--- /dev/null
+++ b/checkpoint-100/config.json
@@ -0,0 +1,29 @@
+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": false,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
diff --git a/checkpoint-100/generation_config.json b/checkpoint-100/generation_config.json
new file mode 100644
index 0000000..92878bd
--- /dev/null
+++ b/checkpoint-100/generation_config.json
@@ -0,0 +1,9 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 151646,
+  "do_sample": true,
+  "eos_token_id": 151643,
+  "temperature": 0.6,
+  "top_p": 0.95,
+  "transformers_version": "4.51.3"
+}
diff --git a/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..e4cbb55
--- /dev/null
+++ b/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64d03fed17192dbd02910a453a129df1e59f0dc56bb9beb6c12fb15fd5a9c1de
+size 5331274140
diff --git a/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..f1bc753
--- /dev/null
+++ b/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f2840c47421f7c64caa11da52c0631d0552552ce75a900497c6cac91258525f
+size 5331276572
diff --git a/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..c099b37
--- /dev/null
+++ b/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3aa79f325e1d1a7ccdd958c5099c83a7b19d5914ea40bc4442eaa921c39f052a
+size 5331276892
diff --git a/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..d51e3cf
--- /dev/null
+++ b/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:99cca373a8c3f9a69295ac81f0febe4fbb3d279ceeec4d3de63743860f11bf47
+size 5331273884
diff --git a/checkpoint-100/global_step100/mp_rank_00_model_states.pt b/checkpoint-100/global_step100/mp_rank_00_model_states.pt
new file mode 100644
index 0000000..f82ca69
--- /dev/null
+++ b/checkpoint-100/global_step100/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e065897058412099161e568c02a323c6b14f44d563d9449bf43b5440be39c020
+size 3554267640
diff --git a/checkpoint-100/latest b/checkpoint-100/latest
new file mode 100644
index 0000000..744ae7d
--- /dev/null
+++ b/checkpoint-100/latest
@@ -0,0 +1 @@
+global_step100
\ No newline at end of file
diff --git a/checkpoint-100/model.safetensors b/checkpoint-100/model.safetensors
new file mode 100644
index 0000000..1fb28a4
--- /dev/null
+++ b/checkpoint-100/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8af2fd262f18c1787af93c8074f28997a45321d177fa5cd04da9d1387b9d7563
+size 3554214752
diff --git a/checkpoint-100/rng_state_0.pth b/checkpoint-100/rng_state_0.pth
new file mode 100644
index 0000000..f388e2d
--- /dev/null
+++ b/checkpoint-100/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be658a6ef1d1c437376e35713827537091c6f33daa9df25eaa9e98991c241626
+size 14960
diff --git a/checkpoint-100/rng_state_1.pth b/checkpoint-100/rng_state_1.pth
new file mode 100644
index 0000000..b8609e2
--- /dev/null
+++ b/checkpoint-100/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aed16eab73104db9391a6f908bd2c021091c25d0695a238c0b37b35f57381747
+size 14960
diff --git a/checkpoint-100/rng_state_2.pth b/checkpoint-100/rng_state_2.pth
new file mode 100644
index 0000000..3faa9f7
--- /dev/null
+++ b/checkpoint-100/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3391aacd8861abedf83b6b4f8fd56e9bda6ca4b7e62a19f0364fc11c5fb64740
+size 14960
diff --git a/checkpoint-100/rng_state_3.pth b/checkpoint-100/rng_state_3.pth
new file mode 100644
index 0000000..a42199a
--- /dev/null
+++ b/checkpoint-100/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:985fd6b7842de914d0250ac9bd68ec694b87f6fc3fae0cbcb3027f4ca123d938
+size 14960
diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt
new file mode 100644
index 0000000..256e515
--- /dev/null
+++ b/checkpoint-100/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b803ddcc2d4f1857fc5f89309ee5e404cb000416c8d7f3e16508a024742ba28a
+size 1064
diff --git a/checkpoint-100/special_tokens_map.json b/checkpoint-100/special_tokens_map.json
new file mode 100644
index 0000000..1d385d6
--- /dev/null
+++ b/checkpoint-100/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/checkpoint-100/tokenizer.json b/checkpoint-100/tokenizer.json
new file mode 100644
index 0000000..e7cd2c1
--- /dev/null
+++ b/checkpoint-100/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a
+size 11422959
diff --git a/checkpoint-100/tokenizer_config.json b/checkpoint-100/tokenizer_config.json
new file mode 100644
index 0000000..ef6e98c
--- /dev/null
+++ b/checkpoint-100/tokenizer_config.json
@@ -0,0 +1,195 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\\n'}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json
new file mode 100644
index 0000000..6590f9c
--- /dev/null
+++ b/checkpoint-100/trainer_state.json
@@ -0,0 +1,2734 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.11428571428571428,
+  "eval_steps": 500,
+  "global_step": 100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1734.0,
+      "completions/mean_length": 1702.03125,
+      "completions/mean_terminated_length": 993.6190795898438,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "epoch": 0.001142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2544386684894562,
+      "learning_rate": 0.0,
+      "loss": -0.0,
+      "num_tokens": 118418.0,
+      "reward": 0.17899775505065918,
+      "reward_std": 0.7650213241577148,
+      "rewards/cosine_scaled_reward/mean": -0.09800112992525101,
+      "rewards/cosine_scaled_reward/std": 0.37953105568885803,
+      "rewards/format_reward/mean": 0.375,
+      "rewards/format_reward/std": 0.48795005679130554,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1894.0,
+      "completions/mean_length": 1738.90625,
+      "completions/mean_terminated_length": 949.0,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "epoch": 0.002285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2436082512140274,
+      "learning_rate": 5e-08,
+      "loss": -0.0,
+      "num_tokens": 239748.0,
+      "reward": 0.3848632574081421,
+      "reward_std": 0.9111153483390808,
+      "rewards/cosine_scaled_reward/mean": 0.020556632429361343,
+      "rewards/cosine_scaled_reward/std": 0.4492928683757782,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1545.0,
+      "completions/mean_length": 1989.015625,
+      "completions/mean_terminated_length": 1104.25,
+      "completions/min_length": 706.0,
+      "completions/min_terminated_length": 706.0,
+      "epoch": 0.0034285714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2544717788696289,
+      "learning_rate": 1e-07,
+      "loss": -0.0,
+      "num_tokens": 377517.0,
+      "reward": -0.3279358148574829,
+      "reward_std": 0.33216947317123413,
+      "rewards/cosine_scaled_reward/mean": -0.20303040742874146,
+      "rewards/cosine_scaled_reward/std": 0.179075226187706,
+      "rewards/format_reward/mean": 0.078125,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2033.0,
+      "completions/mean_length": 1566.421875,
+      "completions/mean_terminated_length": 1084.84375,
+      "completions/min_length": 502.0,
+      "completions/min_terminated_length": 502.0,
+      "epoch": 0.004571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28807103633880615,
+      "learning_rate": 1.5e-07,
+      "loss": -0.0,
+      "num_tokens": 487576.0,
+      "reward": 0.2716121971607208,
+      "reward_std": 0.6643469333648682,
+      "rewards/cosine_scaled_reward/mean": -0.12981891632080078,
+      "rewards/cosine_scaled_reward/std": 0.3019586503505707,
+      "rewards/format_reward/mean": 0.53125,
+      "rewards/format_reward/std": 0.5029674172401428,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1807.0,
+      "completions/mean_length": 1936.84375,
+      "completions/mean_terminated_length": 1031.71435546875,
+      "completions/min_length": 463.0,
+      "completions/min_terminated_length": 463.0,
+      "epoch": 0.005714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26783761382102966,
+      "learning_rate": 2e-07,
+      "loss": -0.0,
+      "num_tokens": 622350.0,
+      "reward": -0.3612896800041199,
+      "reward_std": 0.41048353910446167,
+      "rewards/cosine_scaled_reward/mean": -0.23533234000205994,
+      "rewards/cosine_scaled_reward/std": 0.20467400550842285,
+      "rewards/format_reward/mean": 0.109375,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1301.0,
+      "completions/mean_length": 1889.453125,
+      "completions/mean_terminated_length": 779.625,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "epoch": 0.006857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.262518972158432,
+      "learning_rate": 2.5e-07,
+      "loss": 0.0,
+      "num_tokens": 754923.0,
+      "reward": -0.29250282049179077,
+      "reward_std": 0.5422531962394714,
+      "rewards/cosine_scaled_reward/mean": -0.22437641024589539,
+      "rewards/cosine_scaled_reward/std": 0.22509199380874634,
+      "rewards/format_reward/mean": 0.15625,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1568.0,
+      "completions/mean_length": 1921.921875,
+      "completions/mean_terminated_length": 1314.45458984375,
+      "completions/min_length": 927.0,
+      "completions/min_terminated_length": 927.0,
+      "epoch": 0.008,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22601397335529327,
+      "learning_rate": 3e-07,
+      "loss": 0.0,
+      "num_tokens": 888334.0,
+      "reward": 0.025340259075164795,
+      "reward_std": 0.7285393476486206,
+      "rewards/cosine_scaled_reward/mean": -0.1279548704624176,
+      "rewards/cosine_scaled_reward/std": 0.40222346782684326,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2009.0,
+      "completions/mean_length": 1736.859375,
+      "completions/mean_terminated_length": 999.9473876953125,
+      "completions/min_length": 305.0,
+      "completions/min_terminated_length": 305.0,
+      "epoch": 0.009142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24552854895591736,
+      "learning_rate": 3.5e-07,
+      "loss": 0.0,
+      "num_tokens": 1009909.0,
+      "reward": 0.21729671955108643,
+      "reward_std": 0.6989120244979858,
+      "rewards/cosine_scaled_reward/mean": -0.055414143949747086,
+      "rewards/cosine_scaled_reward/std": 0.47493892908096313,
+      "rewards/format_reward/mean": 0.328125,
+      "rewards/format_reward/std": 0.4732423722743988,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1963.0,
+      "completions/mean_length": 1967.53125,
+      "completions/mean_terminated_length": 1475.77783203125,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "epoch": 0.010285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2430322915315628,
+      "learning_rate": 4e-07,
+      "loss": 0.0,
+      "num_tokens": 1147287.0,
+      "reward": -0.21451422572135925,
+      "reward_std": 0.587526798248291,
+      "rewards/cosine_scaled_reward/mean": -0.19319462776184082,
+      "rewards/cosine_scaled_reward/std": 0.29357606172561646,
+      "rewards/format_reward/mean": 0.171875,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1966.0,
+      "completions/mean_length": 1708.546875,
+      "completions/mean_terminated_length": 961.75,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "epoch": 0.011428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2543582320213318,
+      "learning_rate": 4.5e-07,
+      "loss": 0.0,
+      "num_tokens": 1267466.0,
+      "reward": 0.02539752423763275,
+      "reward_std": 0.545810341835022,
+      "rewards/cosine_scaled_reward/mean": -0.14355123043060303,
+      "rewards/cosine_scaled_reward/std": 0.36147356033325195,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.90625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1579.0,
+      "completions/mean_length": 1967.734375,
+      "completions/mean_terminated_length": 1191.8333740234375,
+      "completions/min_length": 843.0,
+      "completions/min_terminated_length": 843.0,
+      "epoch": 0.012571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24583907425403595,
+      "learning_rate": 5e-07,
+      "loss": -0.0,
+      "num_tokens": 1405073.0,
+      "reward": -0.46971434354782104,
+      "reward_std": 0.36104393005371094,
+      "rewards/cosine_scaled_reward/mean": -0.28173214197158813,
+      "rewards/cosine_scaled_reward/std": 0.17775526642799377,
+      "rewards/format_reward/mean": 0.09375,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1862.0,
+      "completions/mean_length": 1707.5625,
+      "completions/mean_terminated_length": 1176.47998046875,
+      "completions/min_length": 330.0,
+      "completions/min_terminated_length": 330.0,
+      "epoch": 0.013714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3135142922401428,
+      "learning_rate": 5.5e-07,
+      "loss": -0.0,
+      "num_tokens": 1525301.0,
+      "reward": 0.0018395520746707916,
+      "reward_std": 0.7012988328933716,
+      "rewards/cosine_scaled_reward/mean": -0.21783021092414856,
+      "rewards/cosine_scaled_reward/std": 0.324150949716568,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1745.0,
+      "completions/mean_length": 1841.96875,
+      "completions/mean_terminated_length": 1168.933349609375,
+      "completions/min_length": 442.0,
+      "completions/min_terminated_length": 442.0,
+      "epoch": 0.014857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2532394826412201,
+      "learning_rate": 6e-07,
+      "loss": -0.0,
+      "num_tokens": 1654227.0,
+      "reward": -0.10322706401348114,
+      "reward_std": 0.6915165185928345,
+      "rewards/cosine_scaled_reward/mean": -0.17661353945732117,
+      "rewards/cosine_scaled_reward/std": 0.329875111579895,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2017.0,
+      "completions/mean_length": 1816.390625,
+      "completions/mean_terminated_length": 1306.8499755859375,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "epoch": 0.016,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28405147790908813,
+      "learning_rate": 6.5e-07,
+      "loss": 0.0,
+      "num_tokens": 1781084.0,
+      "reward": 0.10602855682373047,
+      "reward_std": 0.630502462387085,
+      "rewards/cosine_scaled_reward/mean": -0.11104822158813477,
+      "rewards/cosine_scaled_reward/std": 0.3846627473831177,
+      "rewards/format_reward/mean": 0.328125,
+      "rewards/format_reward/std": 0.4732423722743988,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1574.0,
+      "completions/mean_length": 1702.109375,
+      "completions/mean_terminated_length": 818.1666870117188,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "epoch": 0.017142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28779250383377075,
+      "learning_rate": 7e-07,
+      "loss": 0.0,
+      "num_tokens": 1900939.0,
+      "reward": 0.32734519243240356,
+      "reward_std": 0.3870265483856201,
+      "rewards/cosine_scaled_reward/mean": 0.007422588765621185,
+      "rewards/cosine_scaled_reward/std": 0.45787373185157776,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 2048.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 2048.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.018285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2337152510881424,
+      "learning_rate": 7.5e-07,
+      "loss": -0.0,
+      "num_tokens": 2042451.0,
+      "reward": -0.5429925918579102,
+      "reward_std": 0.3153150975704193,
+      "rewards/cosine_scaled_reward/mean": -0.2714962661266327,
+      "rewards/cosine_scaled_reward/std": 0.1678173691034317,
+      "rewards/format_reward/mean": 0.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1879.0,
+      "completions/mean_length": 1564.921875,
+      "completions/mean_terminated_length": 858.8846435546875,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "epoch": 0.019428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33599403500556946,
+      "learning_rate": 8e-07,
+      "loss": -0.0,
+      "num_tokens": 2153126.0,
+      "reward": 0.17696775496006012,
+      "reward_std": 0.6489306688308716,
+      "rewards/cosine_scaled_reward/mean": -0.11464111506938934,
+      "rewards/cosine_scaled_reward/std": 0.3551919758319855,
+      "rewards/format_reward/mean": 0.40625,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1298.0,
+      "completions/mean_length": 1795.390625,
+      "completions/mean_terminated_length": 893.21435546875,
+      "completions/min_length": 619.0,
+      "completions/min_terminated_length": 619.0,
+      "epoch": 0.02057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22697053849697113,
+      "learning_rate": 8.499999999999999e-07,
+      "loss": -0.0,
+      "num_tokens": 2278407.0,
+      "reward": -0.10711958259344101,
+      "reward_std": 0.5238703489303589,
+      "rewards/cosine_scaled_reward/mean": -0.1785597801208496,
+      "rewards/cosine_scaled_reward/std": 0.2545098662376404,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1949.0,
+      "completions/mean_length": 1921.484375,
+      "completions/mean_terminated_length": 1238.300048828125,
+      "completions/min_length": 623.0,
+      "completions/min_terminated_length": 623.0,
+      "epoch": 0.021714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23972108960151672,
+      "learning_rate": 9e-07,
+      "loss": 0.0,
+      "num_tokens": 2412638.0,
+      "reward": 0.029344379901885986,
+      "reward_std": 0.6719281077384949,
+      "rewards/cosine_scaled_reward/mean": -0.086890310049057,
+      "rewards/cosine_scaled_reward/std": 0.40220555663108826,
+      "rewards/format_reward/mean": 0.203125,
+      "rewards/format_reward/std": 0.40550529956817627,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2025.0,
+      "completions/mean_length": 1728.5625,
+      "completions/mean_terminated_length": 845.4117431640625,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "epoch": 0.022857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23309311270713806,
+      "learning_rate": 9.499999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 2534618.0,
+      "reward": 0.0131673663854599,
+      "reward_std": 0.4436222314834595,
+      "rewards/cosine_scaled_reward/mean": -0.13404130935668945,
+      "rewards/cosine_scaled_reward/std": 0.32819250226020813,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1923.0,
+      "completions/mean_length": 1777.953125,
+      "completions/mean_terminated_length": 1087.8333740234375,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "epoch": 0.024,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29990270733833313,
+      "learning_rate": 1e-06,
+      "loss": -0.0,
+      "num_tokens": 2659215.0,
+      "reward": -0.1764472872018814,
+      "reward_std": 0.5121938586235046,
+      "rewards/cosine_scaled_reward/mean": -0.2444736361503601,
+      "rewards/cosine_scaled_reward/std": 0.289971262216568,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.390625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1935.0,
+      "completions/mean_length": 1361.28125,
+      "completions/mean_terminated_length": 921.0769653320312,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 0.025142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29922786355018616,
+      "learning_rate": 9.99931462820376e-07,
+      "loss": -0.0,
+      "num_tokens": 2755353.0,
+      "reward": 0.6089149713516235,
+      "reward_std": 0.5986809730529785,
+      "rewards/cosine_scaled_reward/mean": -0.05491749942302704,
+      "rewards/cosine_scaled_reward/std": 0.39076483249664307,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2043.0,
+      "completions/mean_length": 1565.046875,
+      "completions/mean_terminated_length": 903.2222290039062,
+      "completions/min_length": 405.0,
+      "completions/min_terminated_length": 405.0,
+      "epoch": 0.026285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27512773871421814,
+      "learning_rate": 9.997258721585931e-07,
+      "loss": -0.0,
+      "num_tokens": 2866308.0,
+      "reward": 0.21871733665466309,
+      "reward_std": 0.5976030826568604,
+      "rewards/cosine_scaled_reward/mean": -0.10157884657382965,
+      "rewards/cosine_scaled_reward/std": 0.3856185972690582,
+      "rewards/format_reward/mean": 0.421875,
+      "rewards/format_reward/std": 0.49776285886764526,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1947.0,
+      "completions/mean_length": 1801.671875,
+      "completions/mean_terminated_length": 1259.75,
+      "completions/min_length": 573.0,
+      "completions/min_terminated_length": 573.0,
+      "epoch": 0.027428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22642865777015686,
+      "learning_rate": 9.993832906395582e-07,
+      "loss": -0.0,
+      "num_tokens": 2992543.0,
+      "reward": 0.04899948835372925,
+      "reward_std": 0.8525694608688354,
+      "rewards/cosine_scaled_reward/mean": -0.17081275582313538,
+      "rewards/cosine_scaled_reward/std": 0.3993513882160187,
+      "rewards/format_reward/mean": 0.390625,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1715.765625,
+      "completions/mean_terminated_length": 1035.4761962890625,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "epoch": 0.02857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25316134095191956,
+      "learning_rate": 9.989038226169207e-07,
+      "loss": -0.0,
+      "num_tokens": 3112648.0,
+      "reward": 0.10585837811231613,
+      "reward_std": 0.7828943729400635,
+      "rewards/cosine_scaled_reward/mean": -0.11894579976797104,
+      "rewards/cosine_scaled_reward/std": 0.4141720235347748,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1964.0,
+      "completions/mean_length": 1917.703125,
+      "completions/mean_terminated_length": 1452.357177734375,
+      "completions/min_length": 840.0,
+      "completions/min_terminated_length": 840.0,
+      "epoch": 0.029714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2521306574344635,
+      "learning_rate": 9.982876141412855e-07,
+      "loss": -0.0,
+      "num_tokens": 3246013.0,
+      "reward": 0.17620250582695007,
+      "reward_std": 0.6548349857330322,
+      "rewards/cosine_scaled_reward/mean": -0.08377375453710556,
+      "rewards/cosine_scaled_reward/std": 0.3527655303478241,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1990.0,
+      "completions/mean_length": 1851.015625,
+      "completions/mean_terminated_length": 1147.5,
+      "completions/min_length": 506.0,
+      "completions/min_terminated_length": 506.0,
+      "epoch": 0.030857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2730060815811157,
+      "learning_rate": 9.975348529157229e-07,
+      "loss": -0.0,
+      "num_tokens": 3374766.0,
+      "reward": -0.18854813277721405,
+      "reward_std": 0.49348777532577515,
+      "rewards/cosine_scaled_reward/mean": -0.21146157383918762,
+      "rewards/cosine_scaled_reward/std": 0.2601618766784668,
+      "rewards/format_reward/mean": 0.234375,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2023.0,
+      "completions/mean_length": 1798.328125,
+      "completions/mean_terminated_length": 1049.3125,
+      "completions/min_length": 403.0,
+      "completions/min_terminated_length": 403.0,
+      "epoch": 0.032,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2566036880016327,
+      "learning_rate": 9.96645768238595e-07,
+      "loss": 0.0,
+      "num_tokens": 3500195.0,
+      "reward": 0.06705980002880096,
+      "reward_std": 0.7090284824371338,
+      "rewards/cosine_scaled_reward/mean": -0.10709509253501892,
+      "rewards/cosine_scaled_reward/std": 0.4101051986217499,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1988.0,
+      "completions/mean_length": 1930.203125,
+      "completions/mean_terminated_length": 1210.3333740234375,
+      "completions/min_length": 582.0,
+      "completions/min_terminated_length": 582.0,
+      "epoch": 0.03314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25197461247444153,
+      "learning_rate": 9.956206309337066e-07,
+      "loss": 0.0,
+      "num_tokens": 3634200.0,
+      "reward": -0.2462695688009262,
+      "reward_std": 0.5237302780151367,
+      "rewards/cosine_scaled_reward/mean": -0.2012597918510437,
+      "rewards/cosine_scaled_reward/std": 0.23252712190151215,
+      "rewards/format_reward/mean": 0.15625,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1900.0,
+      "completions/mean_length": 1847.65625,
+      "completions/mean_terminated_length": 1061.6923828125,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "epoch": 0.03428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30431485176086426,
+      "learning_rate": 9.944597532678119e-07,
+      "loss": 0.0,
+      "num_tokens": 3762986.0,
+      "reward": -0.05392302945256233,
+      "reward_std": 0.7249555587768555,
+      "rewards/cosine_scaled_reward/mean": -0.15196150541305542,
+      "rewards/cosine_scaled_reward/std": 0.34566983580589294,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1860.0,
+      "completions/mean_length": 1838.671875,
+      "completions/mean_terminated_length": 931.5833740234375,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "epoch": 0.03542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2484513372182846,
+      "learning_rate": 9.931634888554935e-07,
+      "loss": 0.0,
+      "num_tokens": 3891157.0,
+      "reward": -0.11271396279335022,
+      "reward_std": 0.6705260872840881,
+      "rewards/cosine_scaled_reward/mean": -0.1813569962978363,
+      "rewards/cosine_scaled_reward/std": 0.4071698486804962,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1715.0,
+      "completions/mean_length": 1910.109375,
+      "completions/mean_terminated_length": 1417.6429443359375,
+      "completions/min_length": 906.0,
+      "completions/min_terminated_length": 906.0,
+      "epoch": 0.036571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25329527258872986,
+      "learning_rate": 9.917322325514487e-07,
+      "loss": -0.0,
+      "num_tokens": 4023756.0,
+      "reward": -0.08931556344032288,
+      "reward_std": 0.6381070613861084,
+      "rewards/cosine_scaled_reward/mean": -0.16965776681900024,
+      "rewards/cosine_scaled_reward/std": 0.37385129928588867,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.953125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1865.0,
+      "completions/mean_length": 2023.71875,
+      "completions/mean_terminated_length": 1530.0,
+      "completions/min_length": 1107.0,
+      "completions/min_terminated_length": 1107.0,
+      "epoch": 0.037714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22758109867572784,
+      "learning_rate": 9.901664203302124e-07,
+      "loss": 0.0,
+      "num_tokens": 4164490.0,
+      "reward": -0.4589868187904358,
+      "reward_std": 0.5177067518234253,
+      "rewards/cosine_scaled_reward/mean": -0.2919934093952179,
+      "rewards/cosine_scaled_reward/std": 0.2252870500087738,
+      "rewards/format_reward/mean": 0.125,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.453125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1999.0,
+      "completions/mean_length": 1454.78125,
+      "completions/mean_terminated_length": 963.2571411132812,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 0.038857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3234354257583618,
+      "learning_rate": 9.88466529153356e-07,
+      "loss": 0.0,
+      "num_tokens": 4267148.0,
+      "reward": 0.656031608581543,
+      "reward_std": 0.7529654502868652,
+      "rewards/cosine_scaled_reward/mean": 0.05457830801606178,
+      "rewards/cosine_scaled_reward/std": 0.49684229493141174,
+      "rewards/format_reward/mean": 0.546875,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1724.0,
+      "completions/mean_length": 1819.078125,
+      "completions/mean_terminated_length": 716.0909423828125,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "epoch": 0.04,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2821458876132965,
+      "learning_rate": 9.866330768241983e-07,
+      "loss": -0.0,
+      "num_tokens": 4395065.0,
+      "reward": -0.09630556404590607,
+      "reward_std": 0.7089139223098755,
+      "rewards/cosine_scaled_reward/mean": -0.15752778947353363,
+      "rewards/cosine_scaled_reward/std": 0.3647947609424591,
+      "rewards/format_reward/mean": 0.21875,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1811.0,
+      "completions/mean_length": 1954.34375,
+      "completions/mean_terminated_length": 1382.0,
+      "completions/min_length": 949.0,
+      "completions/min_terminated_length": 949.0,
+      "epoch": 0.04114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24163897335529327,
+      "learning_rate": 9.846666218300807e-07,
+      "loss": -0.0,
+      "num_tokens": 4531255.0,
+      "reward": -0.34593287110328674,
+      "reward_std": 0.44493502378463745,
+      "rewards/cosine_scaled_reward/mean": -0.24327893555164337,
+      "rewards/cosine_scaled_reward/std": 0.24784433841705322,
+      "rewards/format_reward/mean": 0.140625,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1723.0,
+      "completions/mean_length": 1868.921875,
+      "completions/mean_terminated_length": 1092.916748046875,
+      "completions/min_length": 620.0,
+      "completions/min_terminated_length": 620.0,
+      "epoch": 0.04228571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24795544147491455,
+      "learning_rate": 9.825677631722435e-07,
+      "loss": -0.0,
+      "num_tokens": 4661890.0,
+      "reward": -0.23053905367851257,
+      "reward_std": 0.34036368131637573,
+      "rewards/cosine_scaled_reward/mean": -0.2246445268392563,
+      "rewards/cosine_scaled_reward/std": 0.15942412614822388,
+      "rewards/format_reward/mean": 0.21875,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1397.0,
+      "completions/mean_length": 1889.53125,
+      "completions/mean_terminated_length": 1033.800048828125,
+      "completions/min_length": 810.0,
+      "completions/min_terminated_length": 810.0,
+      "epoch": 0.04342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24283826351165771,
+      "learning_rate": 9.80337140183366e-07,
+      "loss": 0.0,
+      "num_tokens": 4794532.0,
+      "reward": -0.10043507814407349,
+      "reward_std": 0.47925832867622375,
+      "rewards/cosine_scaled_reward/mean": -0.13615503907203674,
+      "rewards/cosine_scaled_reward/std": 0.3336707651615143,
+      "rewards/format_reward/mean": 0.171875,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1515.0,
+      "completions/mean_length": 1644.828125,
+      "completions/mean_terminated_length": 689.9473876953125,
+      "completions/min_length": 279.0,
+      "completions/min_terminated_length": 279.0,
+      "epoch": 0.044571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28362998366355896,
+      "learning_rate": 9.779754323328192e-07,
+      "loss": 0.0,
+      "num_tokens": 4910585.0,
+      "reward": 0.12284853309392929,
+      "reward_std": 0.4183085858821869,
+      "rewards/cosine_scaled_reward/mean": -0.11045074462890625,
+      "rewards/cosine_scaled_reward/std": 0.30217844247817993,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1632.0,
+      "completions/mean_length": 1618.28125,
+      "completions/mean_terminated_length": 902.0833740234375,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "epoch": 0.045714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.262617826461792,
+      "learning_rate": 9.754833590196926e-07,
+      "loss": 0.0,
+      "num_tokens": 5024227.0,
+      "reward": 0.2076582908630371,
+      "reward_std": 0.42125773429870605,
+      "rewards/cosine_scaled_reward/mean": -0.12273336946964264,
+      "rewards/cosine_scaled_reward/std": 0.4404613971710205,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1914.0,
+      "completions/mean_length": 1717.734375,
+      "completions/mean_terminated_length": 1235.0384521484375,
+      "completions/min_length": 664.0,
+      "completions/min_terminated_length": 664.0,
+      "epoch": 0.046857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23294499516487122,
+      "learning_rate": 9.728616793536587e-07,
+      "loss": -0.0,
+      "num_tokens": 5145314.0,
+      "reward": 0.011502981185913086,
+      "reward_std": 0.6816084980964661,
+      "rewards/cosine_scaled_reward/mean": -0.22081100940704346,
+      "rewards/cosine_scaled_reward/std": 0.37589573860168457,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1672.0,
+      "completions/mean_length": 1703.921875,
+      "completions/mean_terminated_length": 579.933349609375,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 0.048,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34672290086746216,
+      "learning_rate": 9.701111919237408e-07,
+      "loss": -0.0,
+      "num_tokens": 5264725.0,
+      "reward": -0.2616002857685089,
+      "reward_std": 0.37952175736427307,
+      "rewards/cosine_scaled_reward/mean": -0.26361262798309326,
+      "rewards/cosine_scaled_reward/std": 0.17531204223632812,
+      "rewards/format_reward/mean": 0.265625,
+      "rewards/format_reward/std": 0.44515693187713623,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1370.0,
+      "completions/mean_length": 1681.84375,
+      "completions/mean_terminated_length": 814.631591796875,
+      "completions/min_length": 308.0,
+      "completions/min_terminated_length": 308.0,
+      "epoch": 0.04914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.263967901468277,
+      "learning_rate": 9.672327345550543e-07,
+      "loss": -0.0,
+      "num_tokens": 5383979.0,
+      "reward": 0.13376155495643616,
+      "reward_std": 0.46012288331985474,
+      "rewards/cosine_scaled_reward/mean": -0.08155670762062073,
+      "rewards/cosine_scaled_reward/std": 0.3612325191497803,
+      "rewards/format_reward/mean": 0.296875,
+      "rewards/format_reward/std": 0.4604927599430084,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1830.0,
+      "completions/mean_length": 1624.625,
+      "completions/mean_terminated_length": 869.9130859375,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "epoch": 0.05028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28927963972091675,
+      "learning_rate": 9.64227184053598e-07,
+      "loss": -0.0,
+      "num_tokens": 5498651.0,
+      "reward": 0.20869271457195282,
+      "reward_std": 0.5558150410652161,
+      "rewards/cosine_scaled_reward/mean": -0.0987786278128624,
+      "rewards/cosine_scaled_reward/std": 0.42912590503692627,
+      "rewards/format_reward/mean": 0.40625,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.921875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1851.0,
+      "completions/mean_length": 2006.96875,
+      "completions/mean_terminated_length": 1522.800048828125,
+      "completions/min_length": 955.0,
+      "completions/min_terminated_length": 955.0,
+      "epoch": 0.05142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24254000186920166,
+      "learning_rate": 9.610954559391704e-07,
+      "loss": 0.0,
+      "num_tokens": 5638753.0,
+      "reward": -0.2540697157382965,
+      "reward_std": 0.4600578844547272,
+      "rewards/cosine_scaled_reward/mean": -0.20515984296798706,
+      "rewards/cosine_scaled_reward/std": 0.3251590430736542,
+      "rewards/format_reward/mean": 0.15625,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1563.0,
+      "completions/mean_length": 1765.984375,
+      "completions/mean_terminated_length": 919.9375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "epoch": 0.052571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2645930349826813,
+      "learning_rate": 9.578385041664925e-07,
+      "loss": 0.0,
+      "num_tokens": 5762944.0,
+      "reward": -0.213707834482193,
+      "reward_std": 0.38778313994407654,
+      "rewards/cosine_scaled_reward/mean": -0.2318539321422577,
+      "rewards/cosine_scaled_reward/std": 0.21436986327171326,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1993.0,
+      "completions/mean_length": 1583.40625,
+      "completions/mean_terminated_length": 986.0714721679688,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "epoch": 0.053714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.311797559261322,
+      "learning_rate": 9.54457320834625e-07,
+      "loss": 0.0,
+      "num_tokens": 5874682.0,
+      "reward": 0.27925533056259155,
+      "reward_std": 0.6467443704605103,
+      "rewards/cosine_scaled_reward/mean": -0.07912233471870422,
+      "rewards/cosine_scaled_reward/std": 0.4737093150615692,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1527.0,
+      "completions/mean_length": 1690.0625,
+      "completions/mean_terminated_length": 1006.727294921875,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "epoch": 0.054857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26644304394721985,
+      "learning_rate": 9.509529358847654e-07,
+      "loss": -0.0,
+      "num_tokens": 5993390.0,
+      "reward": 0.13692031800746918,
+      "reward_std": 0.5655145049095154,
+      "rewards/cosine_scaled_reward/mean": -0.12685233354568481,
+      "rewards/cosine_scaled_reward/std": 0.32320985198020935,
+      "rewards/format_reward/mean": 0.390625,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2021.0,
+      "completions/mean_length": 1387.140625,
+      "completions/mean_terminated_length": 804.0294189453125,
+      "completions/min_length": 300.0,
+      "completions/min_terminated_length": 300.0,
+      "epoch": 0.056,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3078882396221161,
+      "learning_rate": 9.473264167865171e-07,
+      "loss": 0.0,
+      "num_tokens": 6092231.0,
+      "reward": 0.35559189319610596,
+      "reward_std": 0.5927403569221497,
+      "rewards/cosine_scaled_reward/mean": -0.09564155340194702,
+      "rewards/cosine_scaled_reward/std": 0.4046906530857086,
+      "rewards/format_reward/mean": 0.546875,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1830.0,
+      "completions/mean_length": 1674.890625,
+      "completions/mean_terminated_length": 962.5909423828125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "epoch": 0.05714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23925544321537018,
+      "learning_rate": 9.43578868212728e-07,
+      "loss": -0.0,
+      "num_tokens": 6210240.0,
+      "reward": 0.18573230504989624,
+      "reward_std": 0.5264967083930969,
+      "rewards/cosine_scaled_reward/mean": -0.09463384002447128,
+      "rewards/cosine_scaled_reward/std": 0.4100942015647888,
+      "rewards/format_reward/mean": 0.375,
+      "rewards/format_reward/std": 0.48795005679130554,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.421875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2024.0,
+      "completions/mean_length": 1347.40625,
+      "completions/mean_terminated_length": 836.1621704101562,
+      "completions/min_length": 394.0,
+      "completions/min_terminated_length": 394.0,
+      "epoch": 0.05828571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.325811505317688,
+      "learning_rate": 9.397114317029974e-07,
+      "loss": 0.0,
+      "num_tokens": 6306682.0,
+      "reward": 0.1735648661851883,
+      "reward_std": 0.5335988998413086,
+      "rewards/cosine_scaled_reward/mean": -0.21009255945682526,
+      "rewards/cosine_scaled_reward/std": 0.2623959481716156,
+      "rewards/format_reward/mean": 0.59375,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1390.0,
+      "completions/mean_length": 1727.765625,
+      "completions/mean_terminated_length": 767.0625,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 0.05942857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27392977476119995,
+      "learning_rate": 9.357252853159505e-07,
+      "loss": 0.0,
+      "num_tokens": 6428611.0,
+      "reward": -0.16267812252044678,
+      "reward_std": 0.5682471990585327,
+      "rewards/cosine_scaled_reward/mean": -0.2219640612602234,
+      "rewards/cosine_scaled_reward/std": 0.36739134788513184,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1907.0,
+      "completions/mean_length": 1609.171875,
+      "completions/mean_terminated_length": 924.5999755859375,
+      "completions/min_length": 513.0,
+      "completions/min_terminated_length": 513.0,
+      "epoch": 0.060571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28155064582824707,
+      "learning_rate": 9.316216432703916e-07,
+      "loss": -0.0,
+      "num_tokens": 6542430.0,
+      "reward": 0.0752667784690857,
+      "reward_std": 0.7118167281150818,
+      "rewards/cosine_scaled_reward/mean": -0.18892911076545715,
+      "rewards/cosine_scaled_reward/std": 0.3222156763076782,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1986.0,
+      "completions/mean_length": 1588.234375,
+      "completions/mean_terminated_length": 1067.166748046875,
+      "completions/min_length": 519.0,
+      "completions/min_terminated_length": 519.0,
+      "epoch": 0.061714285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2555343806743622,
+      "learning_rate": 9.274017555754407e-07,
+      "loss": 0.0,
+      "num_tokens": 6655221.0,
+      "reward": 0.6341299414634705,
+      "reward_std": 1.0656921863555908,
+      "rewards/cosine_scaled_reward/mean": 0.05143994837999344,
+      "rewards/cosine_scaled_reward/std": 0.5348308086395264,
+      "rewards/format_reward/mean": 0.53125,
+      "rewards/format_reward/std": 0.5029674172401428,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1420.0,
+      "completions/mean_length": 1549.5625,
+      "completions/mean_terminated_length": 821.0769653320312,
+      "completions/min_length": 444.0,
+      "completions/min_terminated_length": 444.0,
+      "epoch": 0.06285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30243629217147827,
+      "learning_rate": 9.230669076497687e-07,
+      "loss": -0.0,
+      "num_tokens": 6764681.0,
+      "reward": 0.13021975755691528,
+      "reward_std": 0.3984764516353607,
+      "rewards/cosine_scaled_reward/mean": -0.13801513612270355,
+      "rewards/cosine_scaled_reward/std": 0.41228073835372925,
+      "rewards/format_reward/mean": 0.40625,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2040.0,
+      "completions/mean_length": 1633.25,
+      "completions/mean_terminated_length": 1132.689697265625,
+      "completions/min_length": 543.0,
+      "completions/min_terminated_length": 543.0,
+      "epoch": 0.064,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23835402727127075,
+      "learning_rate": 9.186184199300463e-07,
+      "loss": -0.0,
+      "num_tokens": 6880169.0,
+      "reward": 0.27981996536254883,
+      "reward_std": 0.5018116235733032,
+      "rewards/cosine_scaled_reward/mean": -0.10227750986814499,
+      "rewards/cosine_scaled_reward/std": 0.481824666261673,
+      "rewards/format_reward/mean": 0.484375,
+      "rewards/format_reward/std": 0.5037065148353577,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1807.0,
+      "completions/mean_length": 1699.875,
+      "completions/mean_terminated_length": 1156.7999267578125,
+      "completions/min_length": 642.0,
+      "completions/min_terminated_length": 642.0,
+      "epoch": 0.06514285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22349494695663452,
+      "learning_rate": 9.140576474687263e-07,
+      "loss": 0.0,
+      "num_tokens": 7000529.0,
+      "reward": -0.026505012065172195,
+      "reward_std": 0.5785415172576904,
+      "rewards/cosine_scaled_reward/mean": -0.20856501162052155,
+      "rewards/cosine_scaled_reward/std": 0.2749907374382019,
+      "rewards/format_reward/mean": 0.390625,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2044.0,
+      "completions/mean_length": 1457.875,
+      "completions/mean_terminated_length": 1054.105224609375,
+      "completions/min_length": 447.0,
+      "completions/min_terminated_length": 447.0,
+      "epoch": 0.06628571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.261942595243454,
+      "learning_rate": 9.093859795212817e-07,
+      "loss": 0.0,
+      "num_tokens": 7103929.0,
+      "reward": 0.5745843648910522,
+      "reward_std": 0.8671218156814575,
+      "rewards/cosine_scaled_reward/mean": -0.03302033245563507,
+      "rewards/cosine_scaled_reward/std": 0.45529407262802124,
+      "rewards/format_reward/mean": 0.640625,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.484375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2010.0,
+      "completions/mean_length": 1590.0625,
+      "completions/mean_terminated_length": 1159.8787841796875,
+      "completions/min_length": 591.0,
+      "completions/min_terminated_length": 591.0,
+      "epoch": 0.06742857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24828943610191345,
+      "learning_rate": 9.046048391230247e-07,
+      "loss": -0.0,
+      "num_tokens": 7216157.0,
+      "reward": 0.3377103805541992,
+      "reward_std": 0.5543617010116577,
+      "rewards/cosine_scaled_reward/mean": -0.1045822948217392,
+      "rewards/cosine_scaled_reward/std": 0.39040952920913696,
+      "rewards/format_reward/mean": 0.546875,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1976.0,
+      "completions/mean_length": 1622.84375,
+      "completions/mean_terminated_length": 1076.21435546875,
+      "completions/min_length": 555.0,
+      "completions/min_terminated_length": 555.0,
+      "epoch": 0.06857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2752656936645508,
+      "learning_rate": 8.997156826556369e-07,
+      "loss": -0.0,
+      "num_tokens": 7330907.0,
+      "reward": 0.11114693433046341,
+      "reward_std": 0.6926254034042358,
+      "rewards/cosine_scaled_reward/mean": -0.1788015365600586,
+      "rewards/cosine_scaled_reward/std": 0.39409172534942627,
+      "rewards/format_reward/mean": 0.46875,
+      "rewards/format_reward/std": 0.5029674172401428,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1984.0,
+      "completions/mean_length": 1708.859375,
+      "completions/mean_terminated_length": 1014.4285888671875,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "epoch": 0.06971428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22669929265975952,
+      "learning_rate": 8.9471999940354e-07,
+      "loss": -0.0,
+      "num_tokens": 7451794.0,
+      "reward": 0.2345120906829834,
+      "reward_std": 0.6293160319328308,
+      "rewards/cosine_scaled_reward/mean": -0.1093064472079277,
+      "rewards/cosine_scaled_reward/std": 0.29189831018447876,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2008.0,
+      "completions/mean_length": 1281.53125,
+      "completions/mean_terminated_length": 1004.2978515625,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "epoch": 0.07085714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25438693165779114,
+      "learning_rate": 8.896193111002475e-07,
+      "loss": 0.0,
+      "num_tokens": 7544044.0,
+      "reward": 0.9180847406387329,
+      "reward_std": 0.6390912532806396,
+      "rewards/cosine_scaled_reward/mean": 0.06841734796762466,
+      "rewards/cosine_scaled_reward/std": 0.48315128684043884,
+      "rewards/format_reward/mean": 0.78125,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.359375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1678.0,
+      "completions/mean_length": 1310.46875,
+      "completions/mean_terminated_length": 896.731689453125,
+      "completions/min_length": 295.0,
+      "completions/min_terminated_length": 295.0,
+      "epoch": 0.072,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28795576095581055,
+      "learning_rate": 8.844151714648274e-07,
+      "loss": -0.0,
+      "num_tokens": 7638170.0,
+      "reward": 0.6488770246505737,
+      "reward_std": 0.7876260876655579,
+      "rewards/cosine_scaled_reward/mean": -0.019311510026454926,
+      "rewards/cosine_scaled_reward/std": 0.4736698865890503,
+      "rewards/format_reward/mean": 0.6875,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1969.0,
+      "completions/mean_length": 1307.625,
+      "completions/mean_terminated_length": 1039.8297119140625,
+      "completions/min_length": 376.0,
+      "completions/min_terminated_length": 376.0,
+      "epoch": 0.07314285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25637197494506836,
+      "learning_rate": 8.791091657286267e-07,
+      "loss": -0.0,
+      "num_tokens": 7732810.0,
+      "reward": 0.8280279636383057,
+      "reward_std": 0.6804471015930176,
+      "rewards/cosine_scaled_reward/mean": 0.015576483681797981,
+      "rewards/cosine_scaled_reward/std": 0.44819310307502747,
+      "rewards/format_reward/mean": 0.796875,
+      "rewards/format_reward/std": 0.40550529956817627,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.359375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1846.0,
+      "completions/mean_length": 1322.125,
+      "completions/mean_terminated_length": 914.9268188476562,
+      "completions/min_length": 297.0,
+      "completions/min_terminated_length": 297.0,
+      "epoch": 0.07428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2944399118423462,
+      "learning_rate": 8.737029101523929e-07,
+      "loss": -0.0,
+      "num_tokens": 7828130.0,
+      "reward": 0.15610456466674805,
+      "reward_std": 0.4606686234474182,
+      "rewards/cosine_scaled_reward/mean": -0.24226020276546478,
+      "rewards/cosine_scaled_reward/std": 0.33131492137908936,
+      "rewards/format_reward/mean": 0.640625,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1803.0,
+      "completions/mean_length": 1020.21875,
+      "completions/mean_terminated_length": 806.9057006835938,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.07542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32644009590148926,
+      "learning_rate": 8.681980515339463e-07,
+      "loss": 0.0,
+      "num_tokens": 7903656.0,
+      "reward": 0.7972471714019775,
+      "reward_std": 0.7674820423126221,
+      "rewards/cosine_scaled_reward/mean": -0.031063925474882126,
+      "rewards/cosine_scaled_reward/std": 0.5106223225593567,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1910.0,
+      "completions/mean_length": 1750.859375,
+      "completions/mean_terminated_length": 1142.4285888671875,
+      "completions/min_length": 585.0,
+      "completions/min_terminated_length": 585.0,
+      "epoch": 0.07657142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2270829975605011,
+      "learning_rate": 8.625962667065487e-07,
+      "loss": 0.0,
+      "num_tokens": 8026447.0,
+      "reward": -0.1400720775127411,
+      "reward_std": 0.3325888514518738,
+      "rewards/cosine_scaled_reward/mean": -0.24972353875637054,
+      "rewards/cosine_scaled_reward/std": 0.16404789686203003,
+      "rewards/format_reward/mean": 0.359375,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1424.0,
+      "completions/mean_length": 769.546875,
+      "completions/mean_terminated_length": 637.2930908203125,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 0.07771428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.37025144696235657,
+      "learning_rate": 8.568992620281243e-07,
+      "loss": -0.0,
+      "num_tokens": 8084954.0,
+      "reward": 0.9792699813842773,
+      "reward_std": 0.804767370223999,
+      "rewards/cosine_scaled_reward/mean": 0.03651002421975136,
+      "rewards/cosine_scaled_reward/std": 0.46041443943977356,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1701.0,
+      "completions/mean_length": 1086.234375,
+      "completions/mean_terminated_length": 886.6226806640625,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 0.07885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3763800263404846,
+      "learning_rate": 8.511087728614862e-07,
+      "loss": 0.0,
+      "num_tokens": 8164817.0,
+      "reward": 0.35803771018981934,
+      "reward_std": 0.5702667236328125,
+      "rewards/cosine_scaled_reward/mean": -0.24285613000392914,
+      "rewards/cosine_scaled_reward/std": 0.3019825220108032,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1898.0,
+      "completions/mean_length": 1463.375,
+      "completions/mean_terminated_length": 1112.5999755859375,
+      "completions/min_length": 503.0,
+      "completions/min_terminated_length": 503.0,
+      "epoch": 0.08,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24232418835163116,
+      "learning_rate": 8.452265630457282e-07,
+      "loss": -0.0,
+      "num_tokens": 8269929.0,
+      "reward": 0.3703588843345642,
+      "reward_std": 0.7288752794265747,
+      "rewards/cosine_scaled_reward/mean": -0.1351330280303955,
+      "rewards/cosine_scaled_reward/std": 0.3751916289329529,
+      "rewards/format_reward/mean": 0.640625,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1991.0,
+      "completions/mean_length": 1409.859375,
+      "completions/mean_terminated_length": 973.2368774414062,
+      "completions/min_length": 404.0,
+      "completions/min_terminated_length": 404.0,
+      "epoch": 0.08114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.300010621547699,
+      "learning_rate": 8.392544243589427e-07,
+      "loss": 0.0,
+      "num_tokens": 8370880.0,
+      "reward": 0.5196826457977295,
+      "reward_std": 0.7097917795181274,
+      "rewards/cosine_scaled_reward/mean": -0.044846177101135254,
+      "rewards/cosine_scaled_reward/std": 0.508389949798584,
+      "rewards/format_reward/mean": 0.609375,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1801.0,
+      "completions/mean_length": 1228.046875,
+      "completions/mean_terminated_length": 931.4680786132812,
+      "completions/min_length": 331.0,
+      "completions/min_terminated_length": 331.0,
+      "epoch": 0.08228571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30454304814338684,
+      "learning_rate": 8.331941759724268e-07,
+      "loss": -0.0,
+      "num_tokens": 8459827.0,
+      "reward": 0.41365131735801697,
+      "reward_std": 0.5005639791488647,
+      "rewards/cosine_scaled_reward/mean": -0.1759868562221527,
+      "rewards/cosine_scaled_reward/std": 0.19868774712085724,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1719.0,
+      "completions/mean_length": 1513.28125,
+      "completions/mean_terminated_length": 1192.4500732421875,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "epoch": 0.08342857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27848970890045166,
+      "learning_rate": 8.270476638965461e-07,
+      "loss": -0.0,
+      "num_tokens": 8567405.0,
+      "reward": 0.09570223093032837,
+      "reward_std": 0.5445049405097961,
+      "rewards/cosine_scaled_reward/mean": -0.2802739143371582,
+      "rewards/cosine_scaled_reward/std": 0.25603488087654114,
+      "rewards/format_reward/mean": 0.65625,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1888.0,
+      "completions/mean_length": 1240.125,
+      "completions/mean_terminated_length": 924.0,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "epoch": 0.08457142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2789021134376526,
+      "learning_rate": 8.208167604184217e-07,
+      "loss": 0.0,
+      "num_tokens": 8656701.0,
+      "reward": 0.7823752760887146,
+      "reward_std": 0.6479132175445557,
+      "rewards/cosine_scaled_reward/mean": 0.031812600791454315,
+      "rewards/cosine_scaled_reward/std": 0.5397623181343079,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2036.0,
+      "completions/mean_length": 1455.953125,
+      "completions/mean_terminated_length": 1186.8409423828125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "epoch": 0.08571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22443196177482605,
+      "learning_rate": 8.145033635316128e-07,
+      "loss": 0.0,
+      "num_tokens": 8760842.0,
+      "reward": 0.8040015697479248,
+      "reward_std": 0.5675323009490967,
+      "rewards/cosine_scaled_reward/mean": 0.027000809088349342,
+      "rewards/cosine_scaled_reward/std": 0.5096040964126587,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1720.0,
+      "completions/mean_length": 1177.859375,
+      "completions/mean_terminated_length": 863.1276245117188,
+      "completions/min_length": 372.0,
+      "completions/min_terminated_length": 372.0,
+      "epoch": 0.08685714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32647648453712463,
+      "learning_rate": 8.081093963579707e-07,
+      "loss": 0.0,
+      "num_tokens": 8846625.0,
+      "reward": 0.310506671667099,
+      "reward_std": 0.5110941529273987,
+      "rewards/cosine_scaled_reward/mean": -0.2119341641664505,
+      "rewards/cosine_scaled_reward/std": 0.24737994372844696,
+      "rewards/format_reward/mean": 0.734375,
+      "rewards/format_reward/std": 0.44515693187713623,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1787.0,
+      "completions/mean_length": 1263.4375,
+      "completions/mean_terminated_length": 1043.760009765625,
+      "completions/min_length": 501.0,
+      "completions/min_terminated_length": 501.0,
+      "epoch": 0.088,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2545543611049652,
+      "learning_rate": 8.01636806561836e-07,
+      "loss": -0.0,
+      "num_tokens": 8939061.0,
+      "reward": 0.5484907031059265,
+      "reward_std": 0.48998576402664185,
+      "rewards/cosine_scaled_reward/mean": -0.13200464844703674,
+      "rewards/cosine_scaled_reward/std": 0.3430649936199188,
+      "rewards/format_reward/mean": 0.8125,
+      "rewards/format_reward/std": 0.39339789748191833,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1991.0,
+      "completions/mean_length": 1460.78125,
+      "completions/mean_terminated_length": 1059.0,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "epoch": 0.08914285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2583931088447571,
+      "learning_rate": 7.950875657567621e-07,
+      "loss": 0.0,
+      "num_tokens": 9043271.0,
+      "reward": 0.6075442433357239,
+      "reward_std": 0.6895643472671509,
+      "rewards/cosine_scaled_reward/mean": -0.0009153857827186584,
+      "rewards/cosine_scaled_reward/std": 0.48922818899154663,
+      "rewards/format_reward/mean": 0.609375,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1956.0,
+      "completions/mean_length": 1054.875,
+      "completions/mean_terminated_length": 892.3635864257812,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 0.09028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29089078307151794,
+      "learning_rate": 7.884636689049422e-07,
+      "loss": 0.0,
+      "num_tokens": 9120879.0,
+      "reward": 0.6885831356048584,
+      "reward_std": 0.508629322052002,
+      "rewards/cosine_scaled_reward/mean": -0.09320840239524841,
+      "rewards/cosine_scaled_reward/std": 0.38835227489471436,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2037.0,
+      "completions/mean_length": 1399.046875,
+      "completions/mean_terminated_length": 1145.1087646484375,
+      "completions/min_length": 484.0,
+      "completions/min_terminated_length": 484.0,
+      "epoch": 0.09142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27458345890045166,
+      "learning_rate": 7.817671337095244e-07,
+      "loss": 0.0,
+      "num_tokens": 9220810.0,
+      "reward": 0.5549384355545044,
+      "reward_std": 0.7092134952545166,
+      "rewards/cosine_scaled_reward/mean": -0.09753081202507019,
+      "rewards/cosine_scaled_reward/std": 0.4125780463218689,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1833.0,
+      "completions/mean_length": 1084.984375,
+      "completions/mean_terminated_length": 906.6481323242188,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "epoch": 0.09257142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.37247684597969055,
+      "learning_rate": 7.75e-07,
+      "loss": -0.0,
+      "num_tokens": 9301521.0,
+      "reward": 0.5357480049133301,
+      "reward_std": 0.5661624670028687,
+      "rewards/cosine_scaled_reward/mean": -0.18525099754333496,
+      "rewards/cosine_scaled_reward/std": 0.3385297954082489,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2046.0,
+      "completions/mean_length": 1260.921875,
+      "completions/mean_terminated_length": 998.5625,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "epoch": 0.09371428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27329322695732117,
+      "learning_rate": 7.681643291108517e-07,
+      "loss": -0.0,
+      "num_tokens": 9392548.0,
+      "reward": 0.9478914737701416,
+      "reward_std": 0.4313860237598419,
+      "rewards/cosine_scaled_reward/mean": 0.09894578158855438,
+      "rewards/cosine_scaled_reward/std": 0.5477120876312256,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2039.0,
+      "completions/mean_length": 1309.671875,
+      "completions/mean_terminated_length": 922.9285888671875,
+      "completions/min_length": 303.0,
+      "completions/min_terminated_length": 303.0,
+      "epoch": 0.09485714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3202998638153076,
+      "learning_rate": 7.612622032536507e-07,
+      "loss": -0.0,
+      "num_tokens": 9487455.0,
+      "reward": 0.5201998949050903,
+      "reward_std": 0.6858996152877808,
+      "rewards/cosine_scaled_reward/mean": -0.09927503764629364,
+      "rewards/cosine_scaled_reward/std": 0.37909674644470215,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1685.0,
+      "completions/mean_length": 1185.703125,
+      "completions/mean_terminated_length": 965.9019775390625,
+      "completions/min_length": 390.0,
+      "completions/min_terminated_length": 390.0,
+      "epoch": 0.096,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29646041989326477,
+      "learning_rate": 7.54295724882796e-07,
+      "loss": -0.0,
+      "num_tokens": 9574036.0,
+      "reward": 0.6779025793075562,
+      "reward_std": 0.557724118232727,
+      "rewards/cosine_scaled_reward/mean": -0.09073619544506073,
+      "rewards/cosine_scaled_reward/std": 0.3855368196964264,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2021.0,
+      "completions/mean_length": 1297.828125,
+      "completions/mean_terminated_length": 1158.907470703125,
+      "completions/min_length": 601.0,
+      "completions/min_terminated_length": 601.0,
+      "epoch": 0.09714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21307455003261566,
+      "learning_rate": 7.472670160550848e-07,
+      "loss": 0.0,
+      "num_tokens": 9667417.0,
+      "reward": 0.5093189477920532,
+      "reward_std": 0.6006681323051453,
+      "rewards/cosine_scaled_reward/mean": -0.1672155261039734,
+      "rewards/cosine_scaled_reward/std": 0.34896284341812134,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1859.0,
+      "completions/mean_length": 1348.90625,
+      "completions/mean_terminated_length": 1096.04248046875,
+      "completions/min_length": 501.0,
+      "completions/min_terminated_length": 501.0,
+      "epoch": 0.09828571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2883393168449402,
+      "learning_rate": 7.401782177833147e-07,
+      "loss": -0.0,
+      "num_tokens": 9764603.0,
+      "reward": 0.8025823831558228,
+      "reward_std": 0.547119677066803,
+      "rewards/cosine_scaled_reward/mean": 0.01847870647907257,
+      "rewards/cosine_scaled_reward/std": 0.4346420168876648,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1782.0,
+      "completions/mean_length": 1086.96875,
+      "completions/mean_terminated_length": 909.0,
+      "completions/min_length": 350.0,
+      "completions/min_terminated_length": 350.0,
+      "epoch": 0.09942857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31888866424560547,
+      "learning_rate": 7.330314893841101e-07,
+      "loss": -0.0,
+      "num_tokens": 9844289.0,
+      "reward": 0.5533354878425598,
+      "reward_std": 0.5319498777389526,
+      "rewards/cosine_scaled_reward/mean": -0.1530197560787201,
+      "rewards/cosine_scaled_reward/std": 0.2434682846069336,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1984.0,
+      "completions/mean_length": 954.921875,
+      "completions/mean_terminated_length": 919.6612548828125,
+      "completions/min_length": 114.0,
+      "completions/min_terminated_length": 114.0,
+      "epoch": 0.10057142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3025936484336853,
+      "learning_rate": 7.258290078201731e-07,
+      "loss": -0.0,
+      "num_tokens": 9915916.0,
+      "reward": 1.2692296504974365,
+      "reward_std": 0.5115163326263428,
+      "rewards/cosine_scaled_reward/mean": 0.13461479544639587,
+      "rewards/cosine_scaled_reward/std": 0.506001353263855,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1926.0,
+      "completions/mean_length": 1351.8125,
+      "completions/mean_terminated_length": 1174.35302734375,
+      "completions/min_length": 650.0,
+      "completions/min_terminated_length": 650.0,
+      "epoch": 0.10171428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23423585295677185,
+      "learning_rate": 7.185729670371604e-07,
+      "loss": -0.0,
+      "num_tokens": 10013432.0,
+      "reward": 0.724889874458313,
+      "reward_std": 0.7425336837768555,
+      "rewards/cosine_scaled_reward/mean": -0.0828675627708435,
+      "rewards/cosine_scaled_reward/std": 0.3893774449825287,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1906.0,
+      "completions/mean_length": 1153.28125,
+      "completions/mean_terminated_length": 1025.46435546875,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "epoch": 0.10285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3860023021697998,
+      "learning_rate": 7.11265577295385e-07,
+      "loss": -0.0,
+      "num_tokens": 10097242.0,
+      "reward": 0.5000253915786743,
+      "reward_std": 0.5103108286857605,
+      "rewards/cosine_scaled_reward/mean": -0.18748730421066284,
+      "rewards/cosine_scaled_reward/std": 0.2787182629108429,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.328125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2032.0,
+      "completions/mean_length": 1455.484375,
+      "completions/mean_terminated_length": 1166.1163330078125,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "epoch": 0.104,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2551063895225525,
+      "learning_rate": 7.039090644965509e-07,
+      "loss": 0.0,
+      "num_tokens": 10200961.0,
+      "reward": 0.4053259789943695,
+      "reward_std": 0.663999617099762,
+      "rewards/cosine_scaled_reward/mean": -0.18796202540397644,
+      "rewards/cosine_scaled_reward/std": 0.35777655243873596,
+      "rewards/format_reward/mean": 0.78125,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2004.0,
+      "completions/mean_length": 1176.953125,
+      "completions/mean_terminated_length": 1015.6481323242188,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "epoch": 0.10514285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27449366450309753,
+      "learning_rate": 6.965056695057204e-07,
+      "loss": -0.0,
+      "num_tokens": 10286278.0,
+      "reward": 0.5743436217308044,
+      "reward_std": 0.6229422092437744,
+      "rewards/cosine_scaled_reward/mean": -0.15032817423343658,
+      "rewards/cosine_scaled_reward/std": 0.2899566888809204,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2027.0,
+      "completions/mean_length": 1434.875,
+      "completions/mean_terminated_length": 1156.181884765625,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "epoch": 0.10628571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2839376926422119,
+      "learning_rate": 6.890576474687263e-07,
+      "loss": 0.0,
+      "num_tokens": 10389454.0,
+      "reward": 0.30658647418022156,
+      "reward_std": 0.5343226194381714,
+      "rewards/cosine_scaled_reward/mean": -0.22951926290988922,
+      "rewards/cosine_scaled_reward/std": 0.2324177473783493,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1684.0,
+      "completions/mean_length": 1242.390625,
+      "completions/mean_terminated_length": 927.1522216796875,
+      "completions/min_length": 508.0,
+      "completions/min_terminated_length": 508.0,
+      "epoch": 0.10742857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2985072433948517,
+      "learning_rate": 6.815672671252315e-07,
+      "loss": 0.0,
+      "num_tokens": 10478735.0,
+      "reward": 0.6593698263168335,
+      "reward_std": 0.5845412015914917,
+      "rewards/cosine_scaled_reward/mean": -0.02969011664390564,
+      "rewards/cosine_scaled_reward/std": 0.47056320309638977,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1931.0,
+      "completions/mean_length": 1203.265625,
+      "completions/mean_terminated_length": 1082.58935546875,
+      "completions/min_length": 573.0,
+      "completions/min_terminated_length": 573.0,
+      "epoch": 0.10857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2689598798751831,
+      "learning_rate": 6.740368101176495e-07,
+      "loss": 0.0,
+      "num_tokens": 10566272.0,
+      "reward": 0.4301251173019409,
+      "reward_std": 0.4795047640800476,
+      "rewards/cosine_scaled_reward/mean": -0.22243742644786835,
+      "rewards/cosine_scaled_reward/std": 0.2575407326221466,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1827.0,
+      "completions/mean_length": 1205.5625,
+      "completions/mean_terminated_length": 990.8235473632812,
+      "completions/min_length": 441.0,
+      "completions/min_terminated_length": 441.0,
+      "epoch": 0.10971428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30502915382385254,
+      "learning_rate": 6.664685702961344e-07,
+      "loss": -0.0,
+      "num_tokens": 10654564.0,
+      "reward": 0.896080493927002,
+      "reward_std": 0.6987663507461548,
+      "rewards/cosine_scaled_reward/mean": 0.02616523765027523,
+      "rewards/cosine_scaled_reward/std": 0.460237056016922,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1777.0,
+      "completions/mean_length": 1170.390625,
+      "completions/mean_terminated_length": 988.2453002929688,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "epoch": 0.11085714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3103901743888855,
+      "learning_rate": 6.588648530198504e-07,
+      "loss": -0.0,
+      "num_tokens": 10739733.0,
+      "reward": 0.6633297204971313,
+      "reward_std": 0.609075665473938,
+      "rewards/cosine_scaled_reward/mean": -0.12927262485027313,
+      "rewards/cosine_scaled_reward/std": 0.4114542305469513,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1817.0,
+      "completions/mean_length": 1136.5625,
+      "completions/mean_terminated_length": 947.396240234375,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "epoch": 0.112,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2510873079299927,
+      "learning_rate": 6.512279744547392e-07,
+      "loss": 0.0,
+      "num_tokens": 10823537.0,
+      "reward": 0.6613268256187439,
+      "reward_std": 0.4785424768924713,
+      "rewards/cosine_scaled_reward/mean": -0.09902409464120865,
+      "rewards/cosine_scaled_reward/std": 0.4345317482948303,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2031.0,
+      "completions/mean_length": 1171.8125,
+      "completions/mean_terminated_length": 1081.17236328125,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 0.11314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.281054824590683,
+      "learning_rate": 6.435602608679916e-07,
+      "loss": -0.0,
+      "num_tokens": 10909701.0,
+      "reward": 1.0416245460510254,
+      "reward_std": 0.6949809789657593,
+      "rewards/cosine_scaled_reward/mean": 0.0520622618496418,
+      "rewards/cosine_scaled_reward/std": 0.508481502532959,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1935.0,
+      "completions/mean_length": 1120.8125,
+      "completions/mean_terminated_length": 1024.8966064453125,
+      "completions/min_length": 410.0,
+      "completions/min_terminated_length": 410.0,
+      "epoch": 0.11428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2910788655281067,
+      "learning_rate": 6.358640479194451e-07,
+      "loss": 0.0,
+      "num_tokens": 10991145.0,
+      "reward": 1.2036188840866089,
+      "reward_std": 0.8533884286880493,
+      "rewards/cosine_scaled_reward/mean": 0.14087192714214325,
+      "rewards/cosine_scaled_reward/std": 0.5375887751579285,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 100
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 200,
+  "num_input_tokens_seen": 10991145,
+  "num_train_epochs": 1,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin
new file mode 100644
index 0000000..9e03ee7
--- /dev/null
+++ b/checkpoint-100/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec4fbe7e29ae78abab9c9fd5555870c1bffe7656ceef4ac5fa2148a15b61b1e3
+size 8888
diff --git a/checkpoint-100/zero_to_fp32.py b/checkpoint-100/zero_to_fp32.py
new file mode 100644
index 0000000..0e75914
--- /dev/null
+++ b/checkpoint-100/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It
+# gets copied into the top-level checkpoint dir, so the user can easily do the conversion at any point
+# in the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    """Fields extracted from one ``*_model_states.pt`` file by ``parse_model_states``."""
+    # buffer name -> tensor, converted to fp32 with ``.float()`` when parsed
+    buffers: dict
+    # sequence of ``{param name: shape}`` mappings (iterated group by group when merging)
+    param_shapes: list
+    # ``[alias, source]`` name pairs built from the checkpoint's "shared_params" mapping
+    shared_params: list
+    # value stored under the DS_VERSION key, or None when the key is absent
+    # NOTE(review): annotated as int, but the value is taken verbatim from the checkpoint - confirm
+    ds_version: int
+    # frozen param name -> shape, or None when the checkpoint has no frozen-parameter entry
+    frozen_param_shapes: dict
+    # frozen param name -> tensor fragment saved by this rank, or None when absent
+    frozen_param_fragments: dict
+
+
+# Module-level verbosity flag read by the helpers below; the command-line entry point at the bottom
+# of this file overwrites it with the value of ``--debug``.
+debug = 0
+
+# load to cpu: every ``torch.load`` call in this script passes ``map_location=device``
+device = torch.device('cpu')
+
+
+def atoi(text):
+    """Return ``int(text)`` when ``text.isdigit()`` is true, otherwise return ``text`` unchanged."""
+    if text.isdigit():
+        return int(text)
+    return text
+
+
+def natural_keys(text):
+    '''
+    Sort key producing human ("natural") ordering, so that "rank_2" sorts before "rank_10".
+
+    The text is split on runs of digits and every piece is passed through ``atoi``.
+    Usage: alist.sort(key=natural_keys)
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    pieces = re.split(r'(\d+)', text)
+    return list(map(atoi, pieces))
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """
+    Return the path of the rank-0 model states file inside ``checkpoint_dir``.
+
+    Args:
+        - ``checkpoint_dir``: directory that holds the ``*_model_states.pt`` files
+        - ``zero_stage``: ZeRO stage of the checkpoint; it selects the expected file name
+
+    Returns:
+        - path to the model states file
+
+    Raises:
+        - ``FileNotFoundError``: the directory or the expected file doesn't exist
+        - ``ValueError``: ``zero_stage`` is greater than 3
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        # without this branch ``file`` stays unbound and the check below dies with UnboundLocalError
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """
+    Return the files in ``checkpoint_dir`` matching ``glob_pattern``, in natural (human) order.
+
+    Raises:
+        - ``FileNotFoundError``: nothing matched the pattern
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    search_pattern = os.path.join(checkpoint_dir, glob_pattern)
+    matches = glob.glob(search_pattern)
+    ckpt_files = sorted(matches, key=natural_keys)
+
+    if not len(ckpt_files):
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    """Return the naturally sorted ``*_optim_states.pt`` files found in ``checkpoint_dir``."""
+    optim_pattern = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_pattern)
+
+
+def get_model_state_files(checkpoint_dir):
+    """Return the naturally sorted ``*_model_states.pt`` files found in ``checkpoint_dir``."""
+    model_pattern = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_pattern)
+
+
+def parse_model_states(files):
+    """
+    Load every model states file and keep only what is needed to rebuild the fp32 weights.
+
+    Args:
+        - ``files``: paths of the ``*_model_states.pt`` files to read
+
+    Returns:
+        - list with one ``zero_model_state`` per entry of ``files``, in the same order
+
+    Raises:
+        - ``ValueError``: a file has no ``BUFFER_NAMES`` entry, i.e. is not a model state checkpoint
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE: ``weights_only=False`` unpickles arbitrary objects - only convert checkpoints you trust
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # frozen parameters are optional, the key may be missing
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if debug and frozen_param_shapes is not None:
+            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load the optimizer state files and pull out the fp32 master-weight partitions.
+
+    Args:
+        - ``files``: paths of the ``*_optim_states.pt`` files
+        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message
+
+    Returns:
+        - ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` holds one entry
+          per file, in the order of ``files``
+
+    Raises:
+        - ``ValueError``: the first file is not a zero checkpoint, the number of files doesn't match
+          the recorded partition count, or the zero stage is unknown
+    """
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        shard = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(shard)
+
+    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim_state[ZERO_STAGE]
+    world_size = first_optim_state[PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][fp32_groups_key] for shard in state_dicts]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: when true, frozen parameters are left out of the result
+
+    The zero stage detected in the optimizer files selects the stage-specific merge routine.
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    # stages 1 and 2 share one layout, stage 3 has its own
+    if zero_stage <= 2:
+        merge_fn = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        merge_fn = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+
+    return merge_fn(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy rank 0's frozen parameter tensors into ``state_dict`` (modified in place).
+
+    Does nothing when rank 0 recorded no frozen parameter shapes.
+    """
+    rank0_state = zero_model_states[0]
+    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = rank0_state.frozen_param_shapes
+    frozen_param_fragments = rank0_state.frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        numel = shape.numel()
+        total_numel += numel
+
+        # the fragment stored by rank 0 is used as is
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {numel} ")
+
+    total_params = len(frozen_param_shapes)
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    """Return True when ``obj`` has an attribute named ``fn`` and that attribute is callable."""
+    return callable(getattr(obj, fn, None))
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict`` (in place).
+
+    For each param group the per-rank partitions are concatenated into one flat vector, which is
+    then sliced back into parameters following the order and shapes recorded by rank 0.
+
+    Raises:
+        - ``ValueError``: after alignment, the consumed element count of a group differs from the
+          number of available elements
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = [
+        torch.cat([rank_groups[group_idx] for rank_groups in fp32_flat_groups], 0)
+        for group_idx in range(num_param_groups)
+    ]
+    avail_numel = sum(flat_vector.numel() for flat_vector in merged_single_partition_of_fp32_groups)
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+    # live optimizer object, so we are checking that the numbers are within the right range
+    align_to = 2 * world_size
+
+    def zero2_align(x):
+        return align_to * math.ceil(x / align_to)
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, flat_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = flat_vector.numel()
+        for name, shape in shapes.items():
+
+            # shapes may or may not offer ``numel()``; fall back to the product of the dimensions
+            if _has_callable(shape, 'numel'):
+                unpartitioned_numel = shape.numel()
+            else:
+                unpartitioned_numel = math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = flat_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the state_dict of a ZeRO stage <= 2 checkpoint.
+
+    Entries are added in this order: buffers, frozen parameters (unless excluded), trainable
+    parameters, and finally aliases for shared parameters.
+    """
+    rank0_state = zero_model_states[0]
+    state_dict = OrderedDict()
+
+    # buffers
+    state_dict.update(rank0_state.buffers)
+    if debug:
+        print(f"added {len(rank0_state.buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters: the alias points at the very same object as its source entry
+    for alias, source in rank0_state.shared_params:
+        if source in state_dict:
+            state_dict[alias] = state_dict[source]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """
+    Return ``(partitioned_numel, padding_numel)`` for a parameter split across ``world_size`` ranks.
+
+    ``partitioned_numel`` is the per-rank element count (rounded up); ``padding_numel`` is how many
+    elements are missing to make ``unpartitioned_numel`` a multiple of ``world_size``.
+    """
+    leftover = unpartitioned_numel % world_size
+    if leftover:
+        padding_numel = world_size - leftover
+    else:
+        padding_numel = 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild the frozen parameters of a ZeRO stage 3 checkpoint into ``state_dict`` (in place).
+
+    Every rank contributes one fragment per frozen parameter; the fragments are concatenated in
+    rank order, trimmed to the full element count and viewed with the recorded shape. Does nothing
+    when rank 0 recorded no frozen parameter shapes.
+    """
+    rank0_state = zero_model_states[0]
+    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = rank0_state.frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in rank0_state.frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in rank0_state.frozen_param_shapes.items():
+        total_params += 1
+        full_numel = shape.numel()
+        total_numel += full_numel
+
+        fragments = [model_state.frozen_param_fragments[name] for model_state in zero_model_states]
+        # the tail of the concatenation beyond ``full_numel`` is dropped before reshaping
+        state_dict[name] = torch.cat(fragments, 0).narrow(0, 0, full_numel).view(shape)
+
+        if debug:
+            partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(full_numel, world_size)
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+
+    Only ``shape``, ``dtype`` and ``contiguous()`` are provided; the real tensor is assembled when
+    ``contiguous()`` is called.
+
+    Args:
+        - ``flat_groups``: per rank, the list of flat fp32 tensors (one per param group)
+        - ``flat_groups_offset``: cumulative element offsets of the groups of one rank, starting
+          with 0 and holding one more entry than there are groups
+        - ``offset``: start of this parameter's partition in a rank's concatenated groups
+        - ``partitioned_numel``: number of elements every rank holds for this parameter
+        - ``shape``: full shape of the parameter; must provide ``numel()``
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+
+        Returns:
+            - a tensor of shape ``self.shape`` built from every rank's partition, in rank order,
+              with the trailing padding elements dropped
+
+        Raises:
+            - ``ValueError``: the partition doesn't overlap any flat group
+        """
+        # nothing to gather for an empty partition
+        if self.partitioned_numel == 0:
+            return torch.empty(self.shape, dtype=self.dtype)
+
+        start_idx = self.offset
+        end_idx = self.offset + self.partitioned_numel
+
+        # The slice of each group covered by [start_idx, end_idx) is identical on every rank, so it
+        # is worked out once. Clamping to the group boundaries keeps the local offsets valid when a
+        # partition spans more than one group.
+        spans = []
+        for group_id in range(len(self.flat_groups_offset) - 1):
+            group_start = self.flat_groups_offset[group_id]
+            group_end = self.flat_groups_offset[group_id + 1]
+            lo = max(start_idx, group_start)
+            hi = min(end_idx, group_end)
+            if lo < hi:
+                spans.append((group_id, lo - group_start, hi - group_start))
+
+        if not spans:
+            raise ValueError(f"partition [{start_idx}, {end_idx}) lies outside of the flat groups")
+
+        # collect weights from all ranks, rank by rank and group by group
+        pad_flat_param_chunks = []
+        for flat_groups_at_rank_i in self.flat_groups:
+            for group_id, lo, hi in spans:
+                pad_flat_param_chunks.append(flat_groups_at_rank_i[group_id][lo:hi])
+
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Register the trainable parameters of a ZeRO stage 3 checkpoint in ``state_dict`` (in place).
+
+    Every parameter is stored as a ``GatheredTensor``, so no weights are copied here.
+
+    Args:
+        - ``state_dict``: mapping to update
+        - ``world_size``: number of ranks that wrote the checkpoint
+        - ``fp32_flat_groups``: per rank, the list of flat fp32 tensors (one per param group)
+        - ``zero_model_states``: parsed model states; the parameter shapes of rank 0 are used
+
+    Raises:
+        - ``ValueError``: the element count consumed by the parameters differs from the number of
+          elements available in the flat groups
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        # every rank holds a list of flat tensors, so their shapes are reported one by one
+        for i in range(world_size):
+            for j, flat_group in enumerate(fp32_flat_groups[i]):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={flat_group.shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    # cumulative element offsets of rank 0's groups, with a leading 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    # ``offset`` counted per-rank elements; scale it to compare against the total over all ranks
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the state_dict of a ZeRO stage 3 checkpoint.
+
+    Entries are added in this order: buffers, frozen parameters (unless excluded), trainable
+    parameters, and finally aliases for shared parameters.
+    """
+    rank0_state = zero_model_states[0]
+    state_dict = OrderedDict()
+
+    # buffers
+    state_dict.update(rank0_state.buffers)
+    if debug:
+        print(f"added {len(rank0_state.buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters: the alias points at the very same object as its source entry
+    for alias, source in rank0_state.shared_params:
+        if source in state_dict:
+            state_dict[alias] = state_dict[source]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+
+    Values that are the same object under several names are converted once and the result is
+    shared between those names. With ``return_empty_tensor`` set, ``torch.empty`` placeholders of
+    matching shape and dtype are returned instead of the converted data.
+    """
+    torch_state_dict = {}
+    first_name_by_id = {}
+    for name, tensor in state_dict.items():
+        key = id(tensor)
+        if key in first_name_by_id:  # shared tensors
+            torch_state_dict[name] = torch_state_dict[first_name_by_id[key]]
+            continue
+
+        first_name_by_id[key] = name
+        if return_empty_tensor:
+            converted = torch.empty(tensor.shape, dtype=tensor.dtype)
+        else:
+            converted = tensor.contiguous()
+        torch_state_dict[name] = converted
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor with ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Raises:
+        - ``ValueError``: ``tag`` was not given and there is no 'latest' file in ``checkpoint_dir``
+        - ``FileNotFoundError``: the ``checkpoint_dir/tag`` folder doesn't exist
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:
+        # fall back to the tag recorded in the 'latest' file
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if not os.path.isfile(latest_path):
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+        with open(latest_path, 'r') as fd:
+            tag = fd.read().strip()
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    return state_dict if lazy_mode else to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    The weights are written to ``output_dir`` shard by shard; when more than one shard is produced
+    an index json file mapping every weight to its shard file is written as well.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB.
+          ``None`` disables sharding and writes a single file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        - ``ImportError``: ``safetensors`` or ``huggingface_hub`` is needed for the requested options
+          but is not installed
+    """
+
+    # Dependency pre-check
+    # (both imports are optional dependencies; the names they bind are used further down)
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    # (lazy mode: the values are only materialized shard by shard in the save loop below)
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # an memory-efficient approach for sharding
+        # (the split is planned on empty placeholder tensors that only carry shape and dtype)
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # no sharding requested: minimal stand-in exposing the two attributes used below
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        # materialize only the tensors that belong to this shard
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model (``load_state_dict`` is called with ``strict=False``)
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    fp32_state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    cpu_model = model.cpu()
+    cpu_model.load_state_dict(fp32_state_dict, strict=False)
+
+    return cpu_model
+
+
+# Command-line entry point: parse the arguments and run the conversion.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    # adjacent string literals are joined without a separator, so each fragment ends with a space
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files "
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    # module-level flag read by the helper functions of this script
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-150/config.json b/checkpoint-150/config.json
new file mode 100644
index 0000000..78fed5b
--- /dev/null
+++ b/checkpoint-150/config.json
@@ -0,0 +1,29 @@
+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": false,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
diff --git a/checkpoint-150/generation_config.json b/checkpoint-150/generation_config.json
new file mode 100644
index 0000000..92878bd
--- /dev/null
+++ b/checkpoint-150/generation_config.json
@@ -0,0 +1,9 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 151646,
+  "do_sample": true,
+  "eos_token_id": 151643,
+  "temperature": 0.6,
+  "top_p": 0.95,
+  "transformers_version": "4.51.3"
+}
diff --git a/checkpoint-150/global_step150/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-150/global_step150/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..13300da
--- /dev/null
+++ b/checkpoint-150/global_step150/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8263fa53cf56652267f39b5f0f25269cd883d7dd51a99633fd6cf76b7be76642
+size 5331274140
diff --git a/checkpoint-150/global_step150/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-150/global_step150/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..8e48238
--- /dev/null
+++ b/checkpoint-150/global_step150/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff2755e67ca872d954992952904fa6aa172ed763e5dc041dcb6358e9ae584431
+size 5331276572
diff --git a/checkpoint-150/global_step150/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-150/global_step150/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..0d06338
--- /dev/null
+++ b/checkpoint-150/global_step150/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db406d943ceefc64f936cccc352f18706c5b7067190422d4dc9b05b29ecd4adb
+size 5331276892
diff --git a/checkpoint-150/global_step150/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-150/global_step150/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..fa196ed
--- /dev/null
+++ b/checkpoint-150/global_step150/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c8b0281b6b31e755fd279fdb11ddff5ad0366bb628bd0c6f686a0a5cb9fe9b9
+size 5331273884
diff --git a/checkpoint-150/global_step150/mp_rank_00_model_states.pt b/checkpoint-150/global_step150/mp_rank_00_model_states.pt
new file mode 100644
index 0000000..f6ec2c2
--- /dev/null
+++ b/checkpoint-150/global_step150/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd5b89bece86d35e0971e0feffeba21aedbf60967650a4c8721e2e1a63f04a72
+size 3554267640
diff --git a/checkpoint-150/latest b/checkpoint-150/latest
new file mode 100644
index 0000000..daf5be2
--- /dev/null
+++ b/checkpoint-150/latest
@@ -0,0 +1 @@
+global_step150
\ No newline at end of file
diff --git a/checkpoint-150/model.safetensors b/checkpoint-150/model.safetensors
new file mode 100644
index 0000000..204e87c
--- /dev/null
+++ b/checkpoint-150/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39aa333d4909dd2e57b05571e5c8a954a5813ad4f503a72afa6c10497b5f51e8
+size 3554214752
diff --git a/checkpoint-150/rng_state_0.pth b/checkpoint-150/rng_state_0.pth
new file mode 100644
index 0000000..408a873
--- /dev/null
+++ b/checkpoint-150/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96d867261441e1e1663c114f0b6b75f90d9ae6dcdb00127b0edf349cc603361b
+size 14960
diff --git a/checkpoint-150/rng_state_1.pth b/checkpoint-150/rng_state_1.pth
new file mode 100644
index 0000000..c6550cc
--- /dev/null
+++ b/checkpoint-150/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:626697b4919a36bf118090898dd7854d355fe644652e7e94c1d8164c54600ce7
+size 14960
diff --git a/checkpoint-150/rng_state_2.pth b/checkpoint-150/rng_state_2.pth
new file mode 100644
index 0000000..3926a7e
--- /dev/null
+++ b/checkpoint-150/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b78f639ba401ba28686266f62024e2c2c981bcc01eed221e9ab333057c17938e
+size 14960
diff --git a/checkpoint-150/rng_state_3.pth b/checkpoint-150/rng_state_3.pth
new file mode 100644
index 0000000..0a74a91
--- /dev/null
+++ b/checkpoint-150/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4875d2494078536392718e5bfe242a49f416663b092d0de6bae0f1b7dccaf452
+size 14960
diff --git a/checkpoint-150/scheduler.pt b/checkpoint-150/scheduler.pt
new file mode 100644
index 0000000..dd6d7b3
--- /dev/null
+++ b/checkpoint-150/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:652df553f3c9234b0e74dc48466c4ec0ca48a5ae2c6acd4d2c81ea9542491c84
+size 1064
diff --git a/checkpoint-150/special_tokens_map.json b/checkpoint-150/special_tokens_map.json
new file mode 100644
index 0000000..1d385d6
--- /dev/null
+++ b/checkpoint-150/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/checkpoint-150/tokenizer.json b/checkpoint-150/tokenizer.json
new file mode 100644
index 0000000..e7cd2c1
--- /dev/null
+++ b/checkpoint-150/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a
+size 11422959
diff --git a/checkpoint-150/tokenizer_config.json b/checkpoint-150/tokenizer_config.json
new file mode 100644
index 0000000..ef6e98c
--- /dev/null
+++ b/checkpoint-150/tokenizer_config.json
@@ -0,0 +1,195 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool 
%}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\\n'}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/checkpoint-150/trainer_state.json b/checkpoint-150/trainer_state.json
new file mode 100644
index 0000000..2fe6193
--- /dev/null
+++ b/checkpoint-150/trainer_state.json
@@ -0,0 +1,4084 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.17142857142857143,
+  "eval_steps": 500,
+  "global_step": 150,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1734.0,
+      "completions/mean_length": 1702.03125,
+      "completions/mean_terminated_length": 993.6190795898438,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "epoch": 0.001142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2544386684894562,
+      "learning_rate": 0.0,
+      "loss": -0.0,
+      "num_tokens": 118418.0,
+      "reward": 0.17899775505065918,
+      "reward_std": 0.7650213241577148,
+      "rewards/cosine_scaled_reward/mean": -0.09800112992525101,
+      "rewards/cosine_scaled_reward/std": 0.37953105568885803,
+      "rewards/format_reward/mean": 0.375,
+      "rewards/format_reward/std": 0.48795005679130554,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1894.0,
+      "completions/mean_length": 1738.90625,
+      "completions/mean_terminated_length": 949.0,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "epoch": 0.002285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2436082512140274,
+      "learning_rate": 5e-08,
+      "loss": -0.0,
+      "num_tokens": 239748.0,
+      "reward": 0.3848632574081421,
+      "reward_std": 0.9111153483390808,
+      "rewards/cosine_scaled_reward/mean": 0.020556632429361343,
+      "rewards/cosine_scaled_reward/std": 0.4492928683757782,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1545.0,
+      "completions/mean_length": 1989.015625,
+      "completions/mean_terminated_length": 1104.25,
+      "completions/min_length": 706.0,
+      "completions/min_terminated_length": 706.0,
+      "epoch": 0.0034285714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2544717788696289,
+      "learning_rate": 1e-07,
+      "loss": -0.0,
+      "num_tokens": 377517.0,
+      "reward": -0.3279358148574829,
+      "reward_std": 0.33216947317123413,
+      "rewards/cosine_scaled_reward/mean": -0.20303040742874146,
+      "rewards/cosine_scaled_reward/std": 0.179075226187706,
+      "rewards/format_reward/mean": 0.078125,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2033.0,
+      "completions/mean_length": 1566.421875,
+      "completions/mean_terminated_length": 1084.84375,
+      "completions/min_length": 502.0,
+      "completions/min_terminated_length": 502.0,
+      "epoch": 0.004571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28807103633880615,
+      "learning_rate": 1.5e-07,
+      "loss": -0.0,
+      "num_tokens": 487576.0,
+      "reward": 0.2716121971607208,
+      "reward_std": 0.6643469333648682,
+      "rewards/cosine_scaled_reward/mean": -0.12981891632080078,
+      "rewards/cosine_scaled_reward/std": 0.3019586503505707,
+      "rewards/format_reward/mean": 0.53125,
+      "rewards/format_reward/std": 0.5029674172401428,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1807.0,
+      "completions/mean_length": 1936.84375,
+      "completions/mean_terminated_length": 1031.71435546875,
+      "completions/min_length": 463.0,
+      "completions/min_terminated_length": 463.0,
+      "epoch": 0.005714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26783761382102966,
+      "learning_rate": 2e-07,
+      "loss": -0.0,
+      "num_tokens": 622350.0,
+      "reward": -0.3612896800041199,
+      "reward_std": 0.41048353910446167,
+      "rewards/cosine_scaled_reward/mean": -0.23533234000205994,
+      "rewards/cosine_scaled_reward/std": 0.20467400550842285,
+      "rewards/format_reward/mean": 0.109375,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1301.0,
+      "completions/mean_length": 1889.453125,
+      "completions/mean_terminated_length": 779.625,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "epoch": 0.006857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.262518972158432,
+      "learning_rate": 2.5e-07,
+      "loss": 0.0,
+      "num_tokens": 754923.0,
+      "reward": -0.29250282049179077,
+      "reward_std": 0.5422531962394714,
+      "rewards/cosine_scaled_reward/mean": -0.22437641024589539,
+      "rewards/cosine_scaled_reward/std": 0.22509199380874634,
+      "rewards/format_reward/mean": 0.15625,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1568.0,
+      "completions/mean_length": 1921.921875,
+      "completions/mean_terminated_length": 1314.45458984375,
+      "completions/min_length": 927.0,
+      "completions/min_terminated_length": 927.0,
+      "epoch": 0.008,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22601397335529327,
+      "learning_rate": 3e-07,
+      "loss": 0.0,
+      "num_tokens": 888334.0,
+      "reward": 0.025340259075164795,
+      "reward_std": 0.7285393476486206,
+      "rewards/cosine_scaled_reward/mean": -0.1279548704624176,
+      "rewards/cosine_scaled_reward/std": 0.40222346782684326,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2009.0,
+      "completions/mean_length": 1736.859375,
+      "completions/mean_terminated_length": 999.9473876953125,
+      "completions/min_length": 305.0,
+      "completions/min_terminated_length": 305.0,
+      "epoch": 0.009142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24552854895591736,
+      "learning_rate": 3.5e-07,
+      "loss": 0.0,
+      "num_tokens": 1009909.0,
+      "reward": 0.21729671955108643,
+      "reward_std": 0.6989120244979858,
+      "rewards/cosine_scaled_reward/mean": -0.055414143949747086,
+      "rewards/cosine_scaled_reward/std": 0.47493892908096313,
+      "rewards/format_reward/mean": 0.328125,
+      "rewards/format_reward/std": 0.4732423722743988,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1963.0,
+      "completions/mean_length": 1967.53125,
+      "completions/mean_terminated_length": 1475.77783203125,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "epoch": 0.010285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2430322915315628,
+      "learning_rate": 4e-07,
+      "loss": 0.0,
+      "num_tokens": 1147287.0,
+      "reward": -0.21451422572135925,
+      "reward_std": 0.587526798248291,
+      "rewards/cosine_scaled_reward/mean": -0.19319462776184082,
+      "rewards/cosine_scaled_reward/std": 0.29357606172561646,
+      "rewards/format_reward/mean": 0.171875,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1966.0,
+      "completions/mean_length": 1708.546875,
+      "completions/mean_terminated_length": 961.75,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "epoch": 0.011428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2543582320213318,
+      "learning_rate": 4.5e-07,
+      "loss": 0.0,
+      "num_tokens": 1267466.0,
+      "reward": 0.02539752423763275,
+      "reward_std": 0.545810341835022,
+      "rewards/cosine_scaled_reward/mean": -0.14355123043060303,
+      "rewards/cosine_scaled_reward/std": 0.36147356033325195,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.90625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1579.0,
+      "completions/mean_length": 1967.734375,
+      "completions/mean_terminated_length": 1191.8333740234375,
+      "completions/min_length": 843.0,
+      "completions/min_terminated_length": 843.0,
+      "epoch": 0.012571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24583907425403595,
+      "learning_rate": 5e-07,
+      "loss": -0.0,
+      "num_tokens": 1405073.0,
+      "reward": -0.46971434354782104,
+      "reward_std": 0.36104393005371094,
+      "rewards/cosine_scaled_reward/mean": -0.28173214197158813,
+      "rewards/cosine_scaled_reward/std": 0.17775526642799377,
+      "rewards/format_reward/mean": 0.09375,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1862.0,
+      "completions/mean_length": 1707.5625,
+      "completions/mean_terminated_length": 1176.47998046875,
+      "completions/min_length": 330.0,
+      "completions/min_terminated_length": 330.0,
+      "epoch": 0.013714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3135142922401428,
+      "learning_rate": 5.5e-07,
+      "loss": -0.0,
+      "num_tokens": 1525301.0,
+      "reward": 0.0018395520746707916,
+      "reward_std": 0.7012988328933716,
+      "rewards/cosine_scaled_reward/mean": -0.21783021092414856,
+      "rewards/cosine_scaled_reward/std": 0.324150949716568,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1745.0,
+      "completions/mean_length": 1841.96875,
+      "completions/mean_terminated_length": 1168.933349609375,
+      "completions/min_length": 442.0,
+      "completions/min_terminated_length": 442.0,
+      "epoch": 0.014857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2532394826412201,
+      "learning_rate": 6e-07,
+      "loss": -0.0,
+      "num_tokens": 1654227.0,
+      "reward": -0.10322706401348114,
+      "reward_std": 0.6915165185928345,
+      "rewards/cosine_scaled_reward/mean": -0.17661353945732117,
+      "rewards/cosine_scaled_reward/std": 0.329875111579895,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2017.0,
+      "completions/mean_length": 1816.390625,
+      "completions/mean_terminated_length": 1306.8499755859375,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "epoch": 0.016,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28405147790908813,
+      "learning_rate": 6.5e-07,
+      "loss": 0.0,
+      "num_tokens": 1781084.0,
+      "reward": 0.10602855682373047,
+      "reward_std": 0.630502462387085,
+      "rewards/cosine_scaled_reward/mean": -0.11104822158813477,
+      "rewards/cosine_scaled_reward/std": 0.3846627473831177,
+      "rewards/format_reward/mean": 0.328125,
+      "rewards/format_reward/std": 0.4732423722743988,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1574.0,
+      "completions/mean_length": 1702.109375,
+      "completions/mean_terminated_length": 818.1666870117188,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "epoch": 0.017142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28779250383377075,
+      "learning_rate": 7e-07,
+      "loss": 0.0,
+      "num_tokens": 1900939.0,
+      "reward": 0.32734519243240356,
+      "reward_std": 0.3870265483856201,
+      "rewards/cosine_scaled_reward/mean": 0.007422588765621185,
+      "rewards/cosine_scaled_reward/std": 0.45787373185157776,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 2048.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 2048.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.018285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2337152510881424,
+      "learning_rate": 7.5e-07,
+      "loss": -0.0,
+      "num_tokens": 2042451.0,
+      "reward": -0.5429925918579102,
+      "reward_std": 0.3153150975704193,
+      "rewards/cosine_scaled_reward/mean": -0.2714962661266327,
+      "rewards/cosine_scaled_reward/std": 0.1678173691034317,
+      "rewards/format_reward/mean": 0.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1879.0,
+      "completions/mean_length": 1564.921875,
+      "completions/mean_terminated_length": 858.8846435546875,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "epoch": 0.019428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33599403500556946,
+      "learning_rate": 8e-07,
+      "loss": -0.0,
+      "num_tokens": 2153126.0,
+      "reward": 0.17696775496006012,
+      "reward_std": 0.6489306688308716,
+      "rewards/cosine_scaled_reward/mean": -0.11464111506938934,
+      "rewards/cosine_scaled_reward/std": 0.3551919758319855,
+      "rewards/format_reward/mean": 0.40625,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1298.0,
+      "completions/mean_length": 1795.390625,
+      "completions/mean_terminated_length": 893.21435546875,
+      "completions/min_length": 619.0,
+      "completions/min_terminated_length": 619.0,
+      "epoch": 0.02057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22697053849697113,
+      "learning_rate": 8.499999999999999e-07,
+      "loss": -0.0,
+      "num_tokens": 2278407.0,
+      "reward": -0.10711958259344101,
+      "reward_std": 0.5238703489303589,
+      "rewards/cosine_scaled_reward/mean": -0.1785597801208496,
+      "rewards/cosine_scaled_reward/std": 0.2545098662376404,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1949.0,
+      "completions/mean_length": 1921.484375,
+      "completions/mean_terminated_length": 1238.300048828125,
+      "completions/min_length": 623.0,
+      "completions/min_terminated_length": 623.0,
+      "epoch": 0.021714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23972108960151672,
+      "learning_rate": 9e-07,
+      "loss": 0.0,
+      "num_tokens": 2412638.0,
+      "reward": 0.029344379901885986,
+      "reward_std": 0.6719281077384949,
+      "rewards/cosine_scaled_reward/mean": -0.086890310049057,
+      "rewards/cosine_scaled_reward/std": 0.40220555663108826,
+      "rewards/format_reward/mean": 0.203125,
+      "rewards/format_reward/std": 0.40550529956817627,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2025.0,
+      "completions/mean_length": 1728.5625,
+      "completions/mean_terminated_length": 845.4117431640625,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "epoch": 0.022857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23309311270713806,
+      "learning_rate": 9.499999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 2534618.0,
+      "reward": 0.0131673663854599,
+      "reward_std": 0.4436222314834595,
+      "rewards/cosine_scaled_reward/mean": -0.13404130935668945,
+      "rewards/cosine_scaled_reward/std": 0.32819250226020813,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1923.0,
+      "completions/mean_length": 1777.953125,
+      "completions/mean_terminated_length": 1087.8333740234375,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "epoch": 0.024,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29990270733833313,
+      "learning_rate": 1e-06,
+      "loss": -0.0,
+      "num_tokens": 2659215.0,
+      "reward": -0.1764472872018814,
+      "reward_std": 0.5121938586235046,
+      "rewards/cosine_scaled_reward/mean": -0.2444736361503601,
+      "rewards/cosine_scaled_reward/std": 0.289971262216568,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.390625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1935.0,
+      "completions/mean_length": 1361.28125,
+      "completions/mean_terminated_length": 921.0769653320312,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 0.025142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29922786355018616,
+      "learning_rate": 9.99931462820376e-07,
+      "loss": -0.0,
+      "num_tokens": 2755353.0,
+      "reward": 0.6089149713516235,
+      "reward_std": 0.5986809730529785,
+      "rewards/cosine_scaled_reward/mean": -0.05491749942302704,
+      "rewards/cosine_scaled_reward/std": 0.39076483249664307,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2043.0,
+      "completions/mean_length": 1565.046875,
+      "completions/mean_terminated_length": 903.2222290039062,
+      "completions/min_length": 405.0,
+      "completions/min_terminated_length": 405.0,
+      "epoch": 0.026285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27512773871421814,
+      "learning_rate": 9.997258721585931e-07,
+      "loss": -0.0,
+      "num_tokens": 2866308.0,
+      "reward": 0.21871733665466309,
+      "reward_std": 0.5976030826568604,
+      "rewards/cosine_scaled_reward/mean": -0.10157884657382965,
+      "rewards/cosine_scaled_reward/std": 0.3856185972690582,
+      "rewards/format_reward/mean": 0.421875,
+      "rewards/format_reward/std": 0.49776285886764526,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1947.0,
+      "completions/mean_length": 1801.671875,
+      "completions/mean_terminated_length": 1259.75,
+      "completions/min_length": 573.0,
+      "completions/min_terminated_length": 573.0,
+      "epoch": 0.027428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22642865777015686,
+      "learning_rate": 9.993832906395582e-07,
+      "loss": -0.0,
+      "num_tokens": 2992543.0,
+      "reward": 0.04899948835372925,
+      "reward_std": 0.8525694608688354,
+      "rewards/cosine_scaled_reward/mean": -0.17081275582313538,
+      "rewards/cosine_scaled_reward/std": 0.3993513882160187,
+      "rewards/format_reward/mean": 0.390625,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1715.765625,
+      "completions/mean_terminated_length": 1035.4761962890625,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "epoch": 0.02857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25316134095191956,
+      "learning_rate": 9.989038226169207e-07,
+      "loss": -0.0,
+      "num_tokens": 3112648.0,
+      "reward": 0.10585837811231613,
+      "reward_std": 0.7828943729400635,
+      "rewards/cosine_scaled_reward/mean": -0.11894579976797104,
+      "rewards/cosine_scaled_reward/std": 0.4141720235347748,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1964.0,
+      "completions/mean_length": 1917.703125,
+      "completions/mean_terminated_length": 1452.357177734375,
+      "completions/min_length": 840.0,
+      "completions/min_terminated_length": 840.0,
+      "epoch": 0.029714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2521306574344635,
+      "learning_rate": 9.982876141412855e-07,
+      "loss": -0.0,
+      "num_tokens": 3246013.0,
+      "reward": 0.17620250582695007,
+      "reward_std": 0.6548349857330322,
+      "rewards/cosine_scaled_reward/mean": -0.08377375453710556,
+      "rewards/cosine_scaled_reward/std": 0.3527655303478241,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1990.0,
+      "completions/mean_length": 1851.015625,
+      "completions/mean_terminated_length": 1147.5,
+      "completions/min_length": 506.0,
+      "completions/min_terminated_length": 506.0,
+      "epoch": 0.030857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2730060815811157,
+      "learning_rate": 9.975348529157229e-07,
+      "loss": -0.0,
+      "num_tokens": 3374766.0,
+      "reward": -0.18854813277721405,
+      "reward_std": 0.49348777532577515,
+      "rewards/cosine_scaled_reward/mean": -0.21146157383918762,
+      "rewards/cosine_scaled_reward/std": 0.2601618766784668,
+      "rewards/format_reward/mean": 0.234375,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2023.0,
+      "completions/mean_length": 1798.328125,
+      "completions/mean_terminated_length": 1049.3125,
+      "completions/min_length": 403.0,
+      "completions/min_terminated_length": 403.0,
+      "epoch": 0.032,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2566036880016327,
+      "learning_rate": 9.96645768238595e-07,
+      "loss": 0.0,
+      "num_tokens": 3500195.0,
+      "reward": 0.06705980002880096,
+      "reward_std": 0.7090284824371338,
+      "rewards/cosine_scaled_reward/mean": -0.10709509253501892,
+      "rewards/cosine_scaled_reward/std": 0.4101051986217499,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1988.0,
+      "completions/mean_length": 1930.203125,
+      "completions/mean_terminated_length": 1210.3333740234375,
+      "completions/min_length": 582.0,
+      "completions/min_terminated_length": 582.0,
+      "epoch": 0.03314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25197461247444153,
+      "learning_rate": 9.956206309337066e-07,
+      "loss": 0.0,
+      "num_tokens": 3634200.0,
+      "reward": -0.2462695688009262,
+      "reward_std": 0.5237302780151367,
+      "rewards/cosine_scaled_reward/mean": -0.2012597918510437,
+      "rewards/cosine_scaled_reward/std": 0.23252712190151215,
+      "rewards/format_reward/mean": 0.15625,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1900.0,
+      "completions/mean_length": 1847.65625,
+      "completions/mean_terminated_length": 1061.6923828125,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "epoch": 0.03428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30431485176086426,
+      "learning_rate": 9.944597532678119e-07,
+      "loss": 0.0,
+      "num_tokens": 3762986.0,
+      "reward": -0.05392302945256233,
+      "reward_std": 0.7249555587768555,
+      "rewards/cosine_scaled_reward/mean": -0.15196150541305542,
+      "rewards/cosine_scaled_reward/std": 0.34566983580589294,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1860.0,
+      "completions/mean_length": 1838.671875,
+      "completions/mean_terminated_length": 931.5833740234375,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "epoch": 0.03542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2484513372182846,
+      "learning_rate": 9.931634888554935e-07,
+      "loss": 0.0,
+      "num_tokens": 3891157.0,
+      "reward": -0.11271396279335022,
+      "reward_std": 0.6705260872840881,
+      "rewards/cosine_scaled_reward/mean": -0.1813569962978363,
+      "rewards/cosine_scaled_reward/std": 0.4071698486804962,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1715.0,
+      "completions/mean_length": 1910.109375,
+      "completions/mean_terminated_length": 1417.6429443359375,
+      "completions/min_length": 906.0,
+      "completions/min_terminated_length": 906.0,
+      "epoch": 0.036571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25329527258872986,
+      "learning_rate": 9.917322325514487e-07,
+      "loss": -0.0,
+      "num_tokens": 4023756.0,
+      "reward": -0.08931556344032288,
+      "reward_std": 0.6381070613861084,
+      "rewards/cosine_scaled_reward/mean": -0.16965776681900024,
+      "rewards/cosine_scaled_reward/std": 0.37385129928588867,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.953125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1865.0,
+      "completions/mean_length": 2023.71875,
+      "completions/mean_terminated_length": 1530.0,
+      "completions/min_length": 1107.0,
+      "completions/min_terminated_length": 1107.0,
+      "epoch": 0.037714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22758109867572784,
+      "learning_rate": 9.901664203302124e-07,
+      "loss": 0.0,
+      "num_tokens": 4164490.0,
+      "reward": -0.4589868187904358,
+      "reward_std": 0.5177067518234253,
+      "rewards/cosine_scaled_reward/mean": -0.2919934093952179,
+      "rewards/cosine_scaled_reward/std": 0.2252870500087738,
+      "rewards/format_reward/mean": 0.125,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.453125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1999.0,
+      "completions/mean_length": 1454.78125,
+      "completions/mean_terminated_length": 963.2571411132812,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 0.038857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3234354257583618,
+      "learning_rate": 9.88466529153356e-07,
+      "loss": 0.0,
+      "num_tokens": 4267148.0,
+      "reward": 0.656031608581543,
+      "reward_std": 0.7529654502868652,
+      "rewards/cosine_scaled_reward/mean": 0.05457830801606178,
+      "rewards/cosine_scaled_reward/std": 0.49684229493141174,
+      "rewards/format_reward/mean": 0.546875,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1724.0,
+      "completions/mean_length": 1819.078125,
+      "completions/mean_terminated_length": 716.0909423828125,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "epoch": 0.04,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2821458876132965,
+      "learning_rate": 9.866330768241983e-07,
+      "loss": -0.0,
+      "num_tokens": 4395065.0,
+      "reward": -0.09630556404590607,
+      "reward_std": 0.7089139223098755,
+      "rewards/cosine_scaled_reward/mean": -0.15752778947353363,
+      "rewards/cosine_scaled_reward/std": 0.3647947609424591,
+      "rewards/format_reward/mean": 0.21875,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1811.0,
+      "completions/mean_length": 1954.34375,
+      "completions/mean_terminated_length": 1382.0,
+      "completions/min_length": 949.0,
+      "completions/min_terminated_length": 949.0,
+      "epoch": 0.04114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24163897335529327,
+      "learning_rate": 9.846666218300807e-07,
+      "loss": -0.0,
+      "num_tokens": 4531255.0,
+      "reward": -0.34593287110328674,
+      "reward_std": 0.44493502378463745,
+      "rewards/cosine_scaled_reward/mean": -0.24327893555164337,
+      "rewards/cosine_scaled_reward/std": 0.24784433841705322,
+      "rewards/format_reward/mean": 0.140625,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1723.0,
+      "completions/mean_length": 1868.921875,
+      "completions/mean_terminated_length": 1092.916748046875,
+      "completions/min_length": 620.0,
+      "completions/min_terminated_length": 620.0,
+      "epoch": 0.04228571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24795544147491455,
+      "learning_rate": 9.825677631722435e-07,
+      "loss": -0.0,
+      "num_tokens": 4661890.0,
+      "reward": -0.23053905367851257,
+      "reward_std": 0.34036368131637573,
+      "rewards/cosine_scaled_reward/mean": -0.2246445268392563,
+      "rewards/cosine_scaled_reward/std": 0.15942412614822388,
+      "rewards/format_reward/mean": 0.21875,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1397.0,
+      "completions/mean_length": 1889.53125,
+      "completions/mean_terminated_length": 1033.800048828125,
+      "completions/min_length": 810.0,
+      "completions/min_terminated_length": 810.0,
+      "epoch": 0.04342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24283826351165771,
+      "learning_rate": 9.80337140183366e-07,
+      "loss": 0.0,
+      "num_tokens": 4794532.0,
+      "reward": -0.10043507814407349,
+      "reward_std": 0.47925832867622375,
+      "rewards/cosine_scaled_reward/mean": -0.13615503907203674,
+      "rewards/cosine_scaled_reward/std": 0.3336707651615143,
+      "rewards/format_reward/mean": 0.171875,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1515.0,
+      "completions/mean_length": 1644.828125,
+      "completions/mean_terminated_length": 689.9473876953125,
+      "completions/min_length": 279.0,
+      "completions/min_terminated_length": 279.0,
+      "epoch": 0.044571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28362998366355896,
+      "learning_rate": 9.779754323328192e-07,
+      "loss": 0.0,
+      "num_tokens": 4910585.0,
+      "reward": 0.12284853309392929,
+      "reward_std": 0.4183085858821869,
+      "rewards/cosine_scaled_reward/mean": -0.11045074462890625,
+      "rewards/cosine_scaled_reward/std": 0.30217844247817993,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1632.0,
+      "completions/mean_length": 1618.28125,
+      "completions/mean_terminated_length": 902.0833740234375,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "epoch": 0.045714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.262617826461792,
+      "learning_rate": 9.754833590196926e-07,
+      "loss": 0.0,
+      "num_tokens": 5024227.0,
+      "reward": 0.2076582908630371,
+      "reward_std": 0.42125773429870605,
+      "rewards/cosine_scaled_reward/mean": -0.12273336946964264,
+      "rewards/cosine_scaled_reward/std": 0.4404613971710205,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1914.0,
+      "completions/mean_length": 1717.734375,
+      "completions/mean_terminated_length": 1235.0384521484375,
+      "completions/min_length": 664.0,
+      "completions/min_terminated_length": 664.0,
+      "epoch": 0.046857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23294499516487122,
+      "learning_rate": 9.728616793536587e-07,
+      "loss": -0.0,
+      "num_tokens": 5145314.0,
+      "reward": 0.011502981185913086,
+      "reward_std": 0.6816084980964661,
+      "rewards/cosine_scaled_reward/mean": -0.22081100940704346,
+      "rewards/cosine_scaled_reward/std": 0.37589573860168457,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1672.0,
+      "completions/mean_length": 1703.921875,
+      "completions/mean_terminated_length": 579.933349609375,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 0.048,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34672290086746216,
+      "learning_rate": 9.701111919237408e-07,
+      "loss": -0.0,
+      "num_tokens": 5264725.0,
+      "reward": -0.2616002857685089,
+      "reward_std": 0.37952175736427307,
+      "rewards/cosine_scaled_reward/mean": -0.26361262798309326,
+      "rewards/cosine_scaled_reward/std": 0.17531204223632812,
+      "rewards/format_reward/mean": 0.265625,
+      "rewards/format_reward/std": 0.44515693187713623,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1370.0,
+      "completions/mean_length": 1681.84375,
+      "completions/mean_terminated_length": 814.631591796875,
+      "completions/min_length": 308.0,
+      "completions/min_terminated_length": 308.0,
+      "epoch": 0.04914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.263967901468277,
+      "learning_rate": 9.672327345550543e-07,
+      "loss": -0.0,
+      "num_tokens": 5383979.0,
+      "reward": 0.13376155495643616,
+      "reward_std": 0.46012288331985474,
+      "rewards/cosine_scaled_reward/mean": -0.08155670762062073,
+      "rewards/cosine_scaled_reward/std": 0.3612325191497803,
+      "rewards/format_reward/mean": 0.296875,
+      "rewards/format_reward/std": 0.4604927599430084,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1830.0,
+      "completions/mean_length": 1624.625,
+      "completions/mean_terminated_length": 869.9130859375,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "epoch": 0.05028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28927963972091675,
+      "learning_rate": 9.64227184053598e-07,
+      "loss": -0.0,
+      "num_tokens": 5498651.0,
+      "reward": 0.20869271457195282,
+      "reward_std": 0.5558150410652161,
+      "rewards/cosine_scaled_reward/mean": -0.0987786278128624,
+      "rewards/cosine_scaled_reward/std": 0.42912590503692627,
+      "rewards/format_reward/mean": 0.40625,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.921875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1851.0,
+      "completions/mean_length": 2006.96875,
+      "completions/mean_terminated_length": 1522.800048828125,
+      "completions/min_length": 955.0,
+      "completions/min_terminated_length": 955.0,
+      "epoch": 0.05142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24254000186920166,
+      "learning_rate": 9.610954559391704e-07,
+      "loss": 0.0,
+      "num_tokens": 5638753.0,
+      "reward": -0.2540697157382965,
+      "reward_std": 0.4600578844547272,
+      "rewards/cosine_scaled_reward/mean": -0.20515984296798706,
+      "rewards/cosine_scaled_reward/std": 0.3251590430736542,
+      "rewards/format_reward/mean": 0.15625,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1563.0,
+      "completions/mean_length": 1765.984375,
+      "completions/mean_terminated_length": 919.9375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "epoch": 0.052571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2645930349826813,
+      "learning_rate": 9.578385041664925e-07,
+      "loss": 0.0,
+      "num_tokens": 5762944.0,
+      "reward": -0.213707834482193,
+      "reward_std": 0.38778313994407654,
+      "rewards/cosine_scaled_reward/mean": -0.2318539321422577,
+      "rewards/cosine_scaled_reward/std": 0.21436986327171326,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1993.0,
+      "completions/mean_length": 1583.40625,
+      "completions/mean_terminated_length": 986.0714721679688,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "epoch": 0.053714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.311797559261322,
+      "learning_rate": 9.54457320834625e-07,
+      "loss": 0.0,
+      "num_tokens": 5874682.0,
+      "reward": 0.27925533056259155,
+      "reward_std": 0.6467443704605103,
+      "rewards/cosine_scaled_reward/mean": -0.07912233471870422,
+      "rewards/cosine_scaled_reward/std": 0.4737093150615692,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1527.0,
+      "completions/mean_length": 1690.0625,
+      "completions/mean_terminated_length": 1006.727294921875,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "epoch": 0.054857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26644304394721985,
+      "learning_rate": 9.509529358847654e-07,
+      "loss": -0.0,
+      "num_tokens": 5993390.0,
+      "reward": 0.13692031800746918,
+      "reward_std": 0.5655145049095154,
+      "rewards/cosine_scaled_reward/mean": -0.12685233354568481,
+      "rewards/cosine_scaled_reward/std": 0.32320985198020935,
+      "rewards/format_reward/mean": 0.390625,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2021.0,
+      "completions/mean_length": 1387.140625,
+      "completions/mean_terminated_length": 804.0294189453125,
+      "completions/min_length": 300.0,
+      "completions/min_terminated_length": 300.0,
+      "epoch": 0.056,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3078882396221161,
+      "learning_rate": 9.473264167865171e-07,
+      "loss": 0.0,
+      "num_tokens": 6092231.0,
+      "reward": 0.35559189319610596,
+      "reward_std": 0.5927403569221497,
+      "rewards/cosine_scaled_reward/mean": -0.09564155340194702,
+      "rewards/cosine_scaled_reward/std": 0.4046906530857086,
+      "rewards/format_reward/mean": 0.546875,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1830.0,
+      "completions/mean_length": 1674.890625,
+      "completions/mean_terminated_length": 962.5909423828125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "epoch": 0.05714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23925544321537018,
+      "learning_rate": 9.43578868212728e-07,
+      "loss": -0.0,
+      "num_tokens": 6210240.0,
+      "reward": 0.18573230504989624,
+      "reward_std": 0.5264967083930969,
+      "rewards/cosine_scaled_reward/mean": -0.09463384002447128,
+      "rewards/cosine_scaled_reward/std": 0.4100942015647888,
+      "rewards/format_reward/mean": 0.375,
+      "rewards/format_reward/std": 0.48795005679130554,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.421875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2024.0,
+      "completions/mean_length": 1347.40625,
+      "completions/mean_terminated_length": 836.1621704101562,
+      "completions/min_length": 394.0,
+      "completions/min_terminated_length": 394.0,
+      "epoch": 0.05828571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.325811505317688,
+      "learning_rate": 9.397114317029974e-07,
+      "loss": 0.0,
+      "num_tokens": 6306682.0,
+      "reward": 0.1735648661851883,
+      "reward_std": 0.5335988998413086,
+      "rewards/cosine_scaled_reward/mean": -0.21009255945682526,
+      "rewards/cosine_scaled_reward/std": 0.2623959481716156,
+      "rewards/format_reward/mean": 0.59375,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1390.0,
+      "completions/mean_length": 1727.765625,
+      "completions/mean_terminated_length": 767.0625,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 0.05942857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27392977476119995,
+      "learning_rate": 9.357252853159505e-07,
+      "loss": 0.0,
+      "num_tokens": 6428611.0,
+      "reward": -0.16267812252044678,
+      "reward_std": 0.5682471990585327,
+      "rewards/cosine_scaled_reward/mean": -0.2219640612602234,
+      "rewards/cosine_scaled_reward/std": 0.36739134788513184,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1907.0,
+      "completions/mean_length": 1609.171875,
+      "completions/mean_terminated_length": 924.5999755859375,
+      "completions/min_length": 513.0,
+      "completions/min_terminated_length": 513.0,
+      "epoch": 0.060571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28155064582824707,
+      "learning_rate": 9.316216432703916e-07,
+      "loss": -0.0,
+      "num_tokens": 6542430.0,
+      "reward": 0.0752667784690857,
+      "reward_std": 0.7118167281150818,
+      "rewards/cosine_scaled_reward/mean": -0.18892911076545715,
+      "rewards/cosine_scaled_reward/std": 0.3222156763076782,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1986.0,
+      "completions/mean_length": 1588.234375,
+      "completions/mean_terminated_length": 1067.166748046875,
+      "completions/min_length": 519.0,
+      "completions/min_terminated_length": 519.0,
+      "epoch": 0.061714285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2555343806743622,
+      "learning_rate": 9.274017555754407e-07,
+      "loss": 0.0,
+      "num_tokens": 6655221.0,
+      "reward": 0.6341299414634705,
+      "reward_std": 1.0656921863555908,
+      "rewards/cosine_scaled_reward/mean": 0.05143994837999344,
+      "rewards/cosine_scaled_reward/std": 0.5348308086395264,
+      "rewards/format_reward/mean": 0.53125,
+      "rewards/format_reward/std": 0.5029674172401428,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1420.0,
+      "completions/mean_length": 1549.5625,
+      "completions/mean_terminated_length": 821.0769653320312,
+      "completions/min_length": 444.0,
+      "completions/min_terminated_length": 444.0,
+      "epoch": 0.06285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30243629217147827,
+      "learning_rate": 9.230669076497687e-07,
+      "loss": -0.0,
+      "num_tokens": 6764681.0,
+      "reward": 0.13021975755691528,
+      "reward_std": 0.3984764516353607,
+      "rewards/cosine_scaled_reward/mean": -0.13801513612270355,
+      "rewards/cosine_scaled_reward/std": 0.41228073835372925,
+      "rewards/format_reward/mean": 0.40625,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2040.0,
+      "completions/mean_length": 1633.25,
+      "completions/mean_terminated_length": 1132.689697265625,
+      "completions/min_length": 543.0,
+      "completions/min_terminated_length": 543.0,
+      "epoch": 0.064,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23835402727127075,
+      "learning_rate": 9.186184199300463e-07,
+      "loss": -0.0,
+      "num_tokens": 6880169.0,
+      "reward": 0.27981996536254883,
+      "reward_std": 0.5018116235733032,
+      "rewards/cosine_scaled_reward/mean": -0.10227750986814499,
+      "rewards/cosine_scaled_reward/std": 0.481824666261673,
+      "rewards/format_reward/mean": 0.484375,
+      "rewards/format_reward/std": 0.5037065148353577,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1807.0,
+      "completions/mean_length": 1699.875,
+      "completions/mean_terminated_length": 1156.7999267578125,
+      "completions/min_length": 642.0,
+      "completions/min_terminated_length": 642.0,
+      "epoch": 0.06514285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22349494695663452,
+      "learning_rate": 9.140576474687263e-07,
+      "loss": 0.0,
+      "num_tokens": 7000529.0,
+      "reward": -0.026505012065172195,
+      "reward_std": 0.5785415172576904,
+      "rewards/cosine_scaled_reward/mean": -0.20856501162052155,
+      "rewards/cosine_scaled_reward/std": 0.2749907374382019,
+      "rewards/format_reward/mean": 0.390625,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2044.0,
+      "completions/mean_length": 1457.875,
+      "completions/mean_terminated_length": 1054.105224609375,
+      "completions/min_length": 447.0,
+      "completions/min_terminated_length": 447.0,
+      "epoch": 0.06628571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.261942595243454,
+      "learning_rate": 9.093859795212817e-07,
+      "loss": 0.0,
+      "num_tokens": 7103929.0,
+      "reward": 0.5745843648910522,
+      "reward_std": 0.8671218156814575,
+      "rewards/cosine_scaled_reward/mean": -0.03302033245563507,
+      "rewards/cosine_scaled_reward/std": 0.45529407262802124,
+      "rewards/format_reward/mean": 0.640625,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.484375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2010.0,
+      "completions/mean_length": 1590.0625,
+      "completions/mean_terminated_length": 1159.8787841796875,
+      "completions/min_length": 591.0,
+      "completions/min_terminated_length": 591.0,
+      "epoch": 0.06742857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24828943610191345,
+      "learning_rate": 9.046048391230247e-07,
+      "loss": -0.0,
+      "num_tokens": 7216157.0,
+      "reward": 0.3377103805541992,
+      "reward_std": 0.5543617010116577,
+      "rewards/cosine_scaled_reward/mean": -0.1045822948217392,
+      "rewards/cosine_scaled_reward/std": 0.39040952920913696,
+      "rewards/format_reward/mean": 0.546875,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1976.0,
+      "completions/mean_length": 1622.84375,
+      "completions/mean_terminated_length": 1076.21435546875,
+      "completions/min_length": 555.0,
+      "completions/min_terminated_length": 555.0,
+      "epoch": 0.06857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2752656936645508,
+      "learning_rate": 8.997156826556369e-07,
+      "loss": -0.0,
+      "num_tokens": 7330907.0,
+      "reward": 0.11114693433046341,
+      "reward_std": 0.6926254034042358,
+      "rewards/cosine_scaled_reward/mean": -0.1788015365600586,
+      "rewards/cosine_scaled_reward/std": 0.39409172534942627,
+      "rewards/format_reward/mean": 0.46875,
+      "rewards/format_reward/std": 0.5029674172401428,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1984.0,
+      "completions/mean_length": 1708.859375,
+      "completions/mean_terminated_length": 1014.4285888671875,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "epoch": 0.06971428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22669929265975952,
+      "learning_rate": 8.9471999940354e-07,
+      "loss": -0.0,
+      "num_tokens": 7451794.0,
+      "reward": 0.2345120906829834,
+      "reward_std": 0.6293160319328308,
+      "rewards/cosine_scaled_reward/mean": -0.1093064472079277,
+      "rewards/cosine_scaled_reward/std": 0.29189831018447876,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2008.0,
+      "completions/mean_length": 1281.53125,
+      "completions/mean_terminated_length": 1004.2978515625,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "epoch": 0.07085714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25438693165779114,
+      "learning_rate": 8.896193111002475e-07,
+      "loss": 0.0,
+      "num_tokens": 7544044.0,
+      "reward": 0.9180847406387329,
+      "reward_std": 0.6390912532806396,
+      "rewards/cosine_scaled_reward/mean": 0.06841734796762466,
+      "rewards/cosine_scaled_reward/std": 0.48315128684043884,
+      "rewards/format_reward/mean": 0.78125,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.359375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1678.0,
+      "completions/mean_length": 1310.46875,
+      "completions/mean_terminated_length": 896.731689453125,
+      "completions/min_length": 295.0,
+      "completions/min_terminated_length": 295.0,
+      "epoch": 0.072,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28795576095581055,
+      "learning_rate": 8.844151714648274e-07,
+      "loss": -0.0,
+      "num_tokens": 7638170.0,
+      "reward": 0.6488770246505737,
+      "reward_std": 0.7876260876655579,
+      "rewards/cosine_scaled_reward/mean": -0.019311510026454926,
+      "rewards/cosine_scaled_reward/std": 0.4736698865890503,
+      "rewards/format_reward/mean": 0.6875,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1969.0,
+      "completions/mean_length": 1307.625,
+      "completions/mean_terminated_length": 1039.8297119140625,
+      "completions/min_length": 376.0,
+      "completions/min_terminated_length": 376.0,
+      "epoch": 0.07314285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25637197494506836,
+      "learning_rate": 8.791091657286267e-07,
+      "loss": -0.0,
+      "num_tokens": 7732810.0,
+      "reward": 0.8280279636383057,
+      "reward_std": 0.6804471015930176,
+      "rewards/cosine_scaled_reward/mean": 0.015576483681797981,
+      "rewards/cosine_scaled_reward/std": 0.44819310307502747,
+      "rewards/format_reward/mean": 0.796875,
+      "rewards/format_reward/std": 0.40550529956817627,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.359375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1846.0,
+      "completions/mean_length": 1322.125,
+      "completions/mean_terminated_length": 914.9268188476562,
+      "completions/min_length": 297.0,
+      "completions/min_terminated_length": 297.0,
+      "epoch": 0.07428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2944399118423462,
+      "learning_rate": 8.737029101523929e-07,
+      "loss": -0.0,
+      "num_tokens": 7828130.0,
+      "reward": 0.15610456466674805,
+      "reward_std": 0.4606686234474182,
+      "rewards/cosine_scaled_reward/mean": -0.24226020276546478,
+      "rewards/cosine_scaled_reward/std": 0.33131492137908936,
+      "rewards/format_reward/mean": 0.640625,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1803.0,
+      "completions/mean_length": 1020.21875,
+      "completions/mean_terminated_length": 806.9057006835938,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.07542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32644009590148926,
+      "learning_rate": 8.681980515339463e-07,
+      "loss": 0.0,
+      "num_tokens": 7903656.0,
+      "reward": 0.7972471714019775,
+      "reward_std": 0.7674820423126221,
+      "rewards/cosine_scaled_reward/mean": -0.031063925474882126,
+      "rewards/cosine_scaled_reward/std": 0.5106223225593567,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1910.0,
+      "completions/mean_length": 1750.859375,
+      "completions/mean_terminated_length": 1142.4285888671875,
+      "completions/min_length": 585.0,
+      "completions/min_terminated_length": 585.0,
+      "epoch": 0.07657142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2270829975605011,
+      "learning_rate": 8.625962667065487e-07,
+      "loss": 0.0,
+      "num_tokens": 8026447.0,
+      "reward": -0.1400720775127411,
+      "reward_std": 0.3325888514518738,
+      "rewards/cosine_scaled_reward/mean": -0.24972353875637054,
+      "rewards/cosine_scaled_reward/std": 0.16404789686203003,
+      "rewards/format_reward/mean": 0.359375,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1424.0,
+      "completions/mean_length": 769.546875,
+      "completions/mean_terminated_length": 637.2930908203125,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 0.07771428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.37025144696235657,
+      "learning_rate": 8.568992620281243e-07,
+      "loss": -0.0,
+      "num_tokens": 8084954.0,
+      "reward": 0.9792699813842773,
+      "reward_std": 0.804767370223999,
+      "rewards/cosine_scaled_reward/mean": 0.03651002421975136,
+      "rewards/cosine_scaled_reward/std": 0.46041443943977356,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1701.0,
+      "completions/mean_length": 1086.234375,
+      "completions/mean_terminated_length": 886.6226806640625,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 0.07885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3763800263404846,
+      "learning_rate": 8.511087728614862e-07,
+      "loss": 0.0,
+      "num_tokens": 8164817.0,
+      "reward": 0.35803771018981934,
+      "reward_std": 0.5702667236328125,
+      "rewards/cosine_scaled_reward/mean": -0.24285613000392914,
+      "rewards/cosine_scaled_reward/std": 0.3019825220108032,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1898.0,
+      "completions/mean_length": 1463.375,
+      "completions/mean_terminated_length": 1112.5999755859375,
+      "completions/min_length": 503.0,
+      "completions/min_terminated_length": 503.0,
+      "epoch": 0.08,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24232418835163116,
+      "learning_rate": 8.452265630457282e-07,
+      "loss": -0.0,
+      "num_tokens": 8269929.0,
+      "reward": 0.3703588843345642,
+      "reward_std": 0.7288752794265747,
+      "rewards/cosine_scaled_reward/mean": -0.1351330280303955,
+      "rewards/cosine_scaled_reward/std": 0.3751916289329529,
+      "rewards/format_reward/mean": 0.640625,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1991.0,
+      "completions/mean_length": 1409.859375,
+      "completions/mean_terminated_length": 973.2368774414062,
+      "completions/min_length": 404.0,
+      "completions/min_terminated_length": 404.0,
+      "epoch": 0.08114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.300010621547699,
+      "learning_rate": 8.392544243589427e-07,
+      "loss": 0.0,
+      "num_tokens": 8370880.0,
+      "reward": 0.5196826457977295,
+      "reward_std": 0.7097917795181274,
+      "rewards/cosine_scaled_reward/mean": -0.044846177101135254,
+      "rewards/cosine_scaled_reward/std": 0.508389949798584,
+      "rewards/format_reward/mean": 0.609375,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1801.0,
+      "completions/mean_length": 1228.046875,
+      "completions/mean_terminated_length": 931.4680786132812,
+      "completions/min_length": 331.0,
+      "completions/min_terminated_length": 331.0,
+      "epoch": 0.08228571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30454304814338684,
+      "learning_rate": 8.331941759724268e-07,
+      "loss": -0.0,
+      "num_tokens": 8459827.0,
+      "reward": 0.41365131735801697,
+      "reward_std": 0.5005639791488647,
+      "rewards/cosine_scaled_reward/mean": -0.1759868562221527,
+      "rewards/cosine_scaled_reward/std": 0.19868774712085724,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1719.0,
+      "completions/mean_length": 1513.28125,
+      "completions/mean_terminated_length": 1192.4500732421875,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "epoch": 0.08342857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27848970890045166,
+      "learning_rate": 8.270476638965461e-07,
+      "loss": -0.0,
+      "num_tokens": 8567405.0,
+      "reward": 0.09570223093032837,
+      "reward_std": 0.5445049405097961,
+      "rewards/cosine_scaled_reward/mean": -0.2802739143371582,
+      "rewards/cosine_scaled_reward/std": 0.25603488087654114,
+      "rewards/format_reward/mean": 0.65625,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1888.0,
+      "completions/mean_length": 1240.125,
+      "completions/mean_terminated_length": 924.0,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "epoch": 0.08457142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2789021134376526,
+      "learning_rate": 8.208167604184217e-07,
+      "loss": 0.0,
+      "num_tokens": 8656701.0,
+      "reward": 0.7823752760887146,
+      "reward_std": 0.6479132175445557,
+      "rewards/cosine_scaled_reward/mean": 0.031812600791454315,
+      "rewards/cosine_scaled_reward/std": 0.5397623181343079,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2036.0,
+      "completions/mean_length": 1455.953125,
+      "completions/mean_terminated_length": 1186.8409423828125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "epoch": 0.08571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22443196177482605,
+      "learning_rate": 8.145033635316128e-07,
+      "loss": 0.0,
+      "num_tokens": 8760842.0,
+      "reward": 0.8040015697479248,
+      "reward_std": 0.5675323009490967,
+      "rewards/cosine_scaled_reward/mean": 0.027000809088349342,
+      "rewards/cosine_scaled_reward/std": 0.5096040964126587,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1720.0,
+      "completions/mean_length": 1177.859375,
+      "completions/mean_terminated_length": 863.1276245117188,
+      "completions/min_length": 372.0,
+      "completions/min_terminated_length": 372.0,
+      "epoch": 0.08685714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32647648453712463,
+      "learning_rate": 8.081093963579707e-07,
+      "loss": 0.0,
+      "num_tokens": 8846625.0,
+      "reward": 0.310506671667099,
+      "reward_std": 0.5110941529273987,
+      "rewards/cosine_scaled_reward/mean": -0.2119341641664505,
+      "rewards/cosine_scaled_reward/std": 0.24737994372844696,
+      "rewards/format_reward/mean": 0.734375,
+      "rewards/format_reward/std": 0.44515693187713623,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1787.0,
+      "completions/mean_length": 1263.4375,
+      "completions/mean_terminated_length": 1043.760009765625,
+      "completions/min_length": 501.0,
+      "completions/min_terminated_length": 501.0,
+      "epoch": 0.088,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2545543611049652,
+      "learning_rate": 8.01636806561836e-07,
+      "loss": -0.0,
+      "num_tokens": 8939061.0,
+      "reward": 0.5484907031059265,
+      "reward_std": 0.48998576402664185,
+      "rewards/cosine_scaled_reward/mean": -0.13200464844703674,
+      "rewards/cosine_scaled_reward/std": 0.3430649936199188,
+      "rewards/format_reward/mean": 0.8125,
+      "rewards/format_reward/std": 0.39339789748191833,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1991.0,
+      "completions/mean_length": 1460.78125,
+      "completions/mean_terminated_length": 1059.0,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "epoch": 0.08914285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2583931088447571,
+      "learning_rate": 7.950875657567621e-07,
+      "loss": 0.0,
+      "num_tokens": 9043271.0,
+      "reward": 0.6075442433357239,
+      "reward_std": 0.6895643472671509,
+      "rewards/cosine_scaled_reward/mean": -0.0009153857827186584,
+      "rewards/cosine_scaled_reward/std": 0.48922818899154663,
+      "rewards/format_reward/mean": 0.609375,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1956.0,
+      "completions/mean_length": 1054.875,
+      "completions/mean_terminated_length": 892.3635864257812,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 0.09028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29089078307151794,
+      "learning_rate": 7.884636689049422e-07,
+      "loss": 0.0,
+      "num_tokens": 9120879.0,
+      "reward": 0.6885831356048584,
+      "reward_std": 0.508629322052002,
+      "rewards/cosine_scaled_reward/mean": -0.09320840239524841,
+      "rewards/cosine_scaled_reward/std": 0.38835227489471436,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2037.0,
+      "completions/mean_length": 1399.046875,
+      "completions/mean_terminated_length": 1145.1087646484375,
+      "completions/min_length": 484.0,
+      "completions/min_terminated_length": 484.0,
+      "epoch": 0.09142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27458345890045166,
+      "learning_rate": 7.817671337095244e-07,
+      "loss": 0.0,
+      "num_tokens": 9220810.0,
+      "reward": 0.5549384355545044,
+      "reward_std": 0.7092134952545166,
+      "rewards/cosine_scaled_reward/mean": -0.09753081202507019,
+      "rewards/cosine_scaled_reward/std": 0.4125780463218689,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1833.0,
+      "completions/mean_length": 1084.984375,
+      "completions/mean_terminated_length": 906.6481323242188,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "epoch": 0.09257142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.37247684597969055,
+      "learning_rate": 7.75e-07,
+      "loss": -0.0,
+      "num_tokens": 9301521.0,
+      "reward": 0.5357480049133301,
+      "reward_std": 0.5661624670028687,
+      "rewards/cosine_scaled_reward/mean": -0.18525099754333496,
+      "rewards/cosine_scaled_reward/std": 0.3385297954082489,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2046.0,
+      "completions/mean_length": 1260.921875,
+      "completions/mean_terminated_length": 998.5625,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "epoch": 0.09371428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27329322695732117,
+      "learning_rate": 7.681643291108517e-07,
+      "loss": -0.0,
+      "num_tokens": 9392548.0,
+      "reward": 0.9478914737701416,
+      "reward_std": 0.4313860237598419,
+      "rewards/cosine_scaled_reward/mean": 0.09894578158855438,
+      "rewards/cosine_scaled_reward/std": 0.5477120876312256,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2039.0,
+      "completions/mean_length": 1309.671875,
+      "completions/mean_terminated_length": 922.9285888671875,
+      "completions/min_length": 303.0,
+      "completions/min_terminated_length": 303.0,
+      "epoch": 0.09485714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3202998638153076,
+      "learning_rate": 7.612622032536507e-07,
+      "loss": -0.0,
+      "num_tokens": 9487455.0,
+      "reward": 0.5201998949050903,
+      "reward_std": 0.6858996152877808,
+      "rewards/cosine_scaled_reward/mean": -0.09927503764629364,
+      "rewards/cosine_scaled_reward/std": 0.37909674644470215,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1685.0,
+      "completions/mean_length": 1185.703125,
+      "completions/mean_terminated_length": 965.9019775390625,
+      "completions/min_length": 390.0,
+      "completions/min_terminated_length": 390.0,
+      "epoch": 0.096,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29646041989326477,
+      "learning_rate": 7.54295724882796e-07,
+      "loss": -0.0,
+      "num_tokens": 9574036.0,
+      "reward": 0.6779025793075562,
+      "reward_std": 0.557724118232727,
+      "rewards/cosine_scaled_reward/mean": -0.09073619544506073,
+      "rewards/cosine_scaled_reward/std": 0.3855368196964264,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2021.0,
+      "completions/mean_length": 1297.828125,
+      "completions/mean_terminated_length": 1158.907470703125,
+      "completions/min_length": 601.0,
+      "completions/min_terminated_length": 601.0,
+      "epoch": 0.09714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21307455003261566,
+      "learning_rate": 7.472670160550848e-07,
+      "loss": 0.0,
+      "num_tokens": 9667417.0,
+      "reward": 0.5093189477920532,
+      "reward_std": 0.6006681323051453,
+      "rewards/cosine_scaled_reward/mean": -0.1672155261039734,
+      "rewards/cosine_scaled_reward/std": 0.34896284341812134,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1859.0,
+      "completions/mean_length": 1348.90625,
+      "completions/mean_terminated_length": 1096.04248046875,
+      "completions/min_length": 501.0,
+      "completions/min_terminated_length": 501.0,
+      "epoch": 0.09828571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2883393168449402,
+      "learning_rate": 7.401782177833147e-07,
+      "loss": -0.0,
+      "num_tokens": 9764603.0,
+      "reward": 0.8025823831558228,
+      "reward_std": 0.547119677066803,
+      "rewards/cosine_scaled_reward/mean": 0.01847870647907257,
+      "rewards/cosine_scaled_reward/std": 0.4346420168876648,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1782.0,
+      "completions/mean_length": 1086.96875,
+      "completions/mean_terminated_length": 909.0,
+      "completions/min_length": 350.0,
+      "completions/min_terminated_length": 350.0,
+      "epoch": 0.09942857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31888866424560547,
+      "learning_rate": 7.330314893841101e-07,
+      "loss": -0.0,
+      "num_tokens": 9844289.0,
+      "reward": 0.5533354878425598,
+      "reward_std": 0.5319498777389526,
+      "rewards/cosine_scaled_reward/mean": -0.1530197560787201,
+      "rewards/cosine_scaled_reward/std": 0.2434682846069336,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1984.0,
+      "completions/mean_length": 954.921875,
+      "completions/mean_terminated_length": 919.6612548828125,
+      "completions/min_length": 114.0,
+      "completions/min_terminated_length": 114.0,
+      "epoch": 0.10057142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3025936484336853,
+      "learning_rate": 7.258290078201731e-07,
+      "loss": -0.0,
+      "num_tokens": 9915916.0,
+      "reward": 1.2692296504974365,
+      "reward_std": 0.5115163326263428,
+      "rewards/cosine_scaled_reward/mean": 0.13461479544639587,
+      "rewards/cosine_scaled_reward/std": 0.506001353263855,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1926.0,
+      "completions/mean_length": 1351.8125,
+      "completions/mean_terminated_length": 1174.35302734375,
+      "completions/min_length": 650.0,
+      "completions/min_terminated_length": 650.0,
+      "epoch": 0.10171428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23423585295677185,
+      "learning_rate": 7.185729670371604e-07,
+      "loss": -0.0,
+      "num_tokens": 10013432.0,
+      "reward": 0.724889874458313,
+      "reward_std": 0.7425336837768555,
+      "rewards/cosine_scaled_reward/mean": -0.0828675627708435,
+      "rewards/cosine_scaled_reward/std": 0.3893774449825287,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1906.0,
+      "completions/mean_length": 1153.28125,
+      "completions/mean_terminated_length": 1025.46435546875,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "epoch": 0.10285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3860023021697998,
+      "learning_rate": 7.11265577295385e-07,
+      "loss": -0.0,
+      "num_tokens": 10097242.0,
+      "reward": 0.5000253915786743,
+      "reward_std": 0.5103108286857605,
+      "rewards/cosine_scaled_reward/mean": -0.18748730421066284,
+      "rewards/cosine_scaled_reward/std": 0.2787182629108429,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.328125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2032.0,
+      "completions/mean_length": 1455.484375,
+      "completions/mean_terminated_length": 1166.1163330078125,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "epoch": 0.104,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2551063895225525,
+      "learning_rate": 7.039090644965509e-07,
+      "loss": 0.0,
+      "num_tokens": 10200961.0,
+      "reward": 0.4053259789943695,
+      "reward_std": 0.663999617099762,
+      "rewards/cosine_scaled_reward/mean": -0.18796202540397644,
+      "rewards/cosine_scaled_reward/std": 0.35777655243873596,
+      "rewards/format_reward/mean": 0.78125,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2004.0,
+      "completions/mean_length": 1176.953125,
+      "completions/mean_terminated_length": 1015.6481323242188,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "epoch": 0.10514285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27449366450309753,
+      "learning_rate": 6.965056695057204e-07,
+      "loss": -0.0,
+      "num_tokens": 10286278.0,
+      "reward": 0.5743436217308044,
+      "reward_std": 0.6229422092437744,
+      "rewards/cosine_scaled_reward/mean": -0.15032817423343658,
+      "rewards/cosine_scaled_reward/std": 0.2899566888809204,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2027.0,
+      "completions/mean_length": 1434.875,
+      "completions/mean_terminated_length": 1156.181884765625,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "epoch": 0.10628571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2839376926422119,
+      "learning_rate": 6.890576474687263e-07,
+      "loss": 0.0,
+      "num_tokens": 10389454.0,
+      "reward": 0.30658647418022156,
+      "reward_std": 0.5343226194381714,
+      "rewards/cosine_scaled_reward/mean": -0.22951926290988922,
+      "rewards/cosine_scaled_reward/std": 0.2324177473783493,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1684.0,
+      "completions/mean_length": 1242.390625,
+      "completions/mean_terminated_length": 927.1522216796875,
+      "completions/min_length": 508.0,
+      "completions/min_terminated_length": 508.0,
+      "epoch": 0.10742857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2985072433948517,
+      "learning_rate": 6.815672671252315e-07,
+      "loss": 0.0,
+      "num_tokens": 10478735.0,
+      "reward": 0.6593698263168335,
+      "reward_std": 0.5845412015914917,
+      "rewards/cosine_scaled_reward/mean": -0.02969011664390564,
+      "rewards/cosine_scaled_reward/std": 0.47056320309638977,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1931.0,
+      "completions/mean_length": 1203.265625,
+      "completions/mean_terminated_length": 1082.58935546875,
+      "completions/min_length": 573.0,
+      "completions/min_terminated_length": 573.0,
+      "epoch": 0.10857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2689598798751831,
+      "learning_rate": 6.740368101176495e-07,
+      "loss": 0.0,
+      "num_tokens": 10566272.0,
+      "reward": 0.4301251173019409,
+      "reward_std": 0.4795047640800476,
+      "rewards/cosine_scaled_reward/mean": -0.22243742644786835,
+      "rewards/cosine_scaled_reward/std": 0.2575407326221466,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1827.0,
+      "completions/mean_length": 1205.5625,
+      "completions/mean_terminated_length": 990.8235473632812,
+      "completions/min_length": 441.0,
+      "completions/min_terminated_length": 441.0,
+      "epoch": 0.10971428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30502915382385254,
+      "learning_rate": 6.664685702961344e-07,
+      "loss": -0.0,
+      "num_tokens": 10654564.0,
+      "reward": 0.896080493927002,
+      "reward_std": 0.6987663507461548,
+      "rewards/cosine_scaled_reward/mean": 0.02616523765027523,
+      "rewards/cosine_scaled_reward/std": 0.460237056016922,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1777.0,
+      "completions/mean_length": 1170.390625,
+      "completions/mean_terminated_length": 988.2453002929688,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "epoch": 0.11085714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3103901743888855,
+      "learning_rate": 6.588648530198504e-07,
+      "loss": -0.0,
+      "num_tokens": 10739733.0,
+      "reward": 0.6633297204971313,
+      "reward_std": 0.609075665473938,
+      "rewards/cosine_scaled_reward/mean": -0.12927262485027313,
+      "rewards/cosine_scaled_reward/std": 0.4114542305469513,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1817.0,
+      "completions/mean_length": 1136.5625,
+      "completions/mean_terminated_length": 947.396240234375,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "epoch": 0.112,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2510873079299927,
+      "learning_rate": 6.512279744547392e-07,
+      "loss": 0.0,
+      "num_tokens": 10823537.0,
+      "reward": 0.6613268256187439,
+      "reward_std": 0.4785424768924713,
+      "rewards/cosine_scaled_reward/mean": -0.09902409464120865,
+      "rewards/cosine_scaled_reward/std": 0.4345317482948303,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2031.0,
+      "completions/mean_length": 1171.8125,
+      "completions/mean_terminated_length": 1081.17236328125,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 0.11314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.281054824590683,
+      "learning_rate": 6.435602608679916e-07,
+      "loss": -0.0,
+      "num_tokens": 10909701.0,
+      "reward": 1.0416245460510254,
+      "reward_std": 0.6949809789657593,
+      "rewards/cosine_scaled_reward/mean": 0.0520622618496418,
+      "rewards/cosine_scaled_reward/std": 0.508481502532959,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1935.0,
+      "completions/mean_length": 1120.8125,
+      "completions/mean_terminated_length": 1024.8966064453125,
+      "completions/min_length": 410.0,
+      "completions/min_terminated_length": 410.0,
+      "epoch": 0.11428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2910788655281067,
+      "learning_rate": 6.358640479194451e-07,
+      "loss": 0.0,
+      "num_tokens": 10991145.0,
+      "reward": 1.2036188840866089,
+      "reward_std": 0.8533884286880493,
+      "rewards/cosine_scaled_reward/mean": 0.14087192714214325,
+      "rewards/cosine_scaled_reward/std": 0.5375887751579285,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1978.0,
+      "completions/mean_length": 1076.953125,
+      "completions/mean_terminated_length": 1029.1966552734375,
+      "completions/min_length": 423.0,
+      "completions/min_terminated_length": 423.0,
+      "epoch": 0.11542857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33955609798431396,
+      "learning_rate": 6.281416799501187e-07,
+      "loss": 0.0,
+      "num_tokens": 11071502.0,
+      "reward": 0.7810705900192261,
+      "reward_std": 0.5973731279373169,
+      "rewards/cosine_scaled_reward/mean": -0.10165221989154816,
+      "rewards/cosine_scaled_reward/std": 0.4130260646343231,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1959.0,
+      "completions/mean_length": 1092.078125,
+      "completions/mean_terminated_length": 935.654541015625,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "epoch": 0.11657142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34537607431411743,
+      "learning_rate": 6.203955092681039e-07,
+      "loss": 0.0,
+      "num_tokens": 11151547.0,
+      "reward": 0.6441041231155396,
+      "reward_std": 0.53089839220047,
+      "rewards/cosine_scaled_reward/mean": -0.10763543844223022,
+      "rewards/cosine_scaled_reward/std": 0.39948928356170654,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2039.0,
+      "completions/mean_length": 1120.625,
+      "completions/mean_terminated_length": 1006.7368774414062,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 0.11771428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.343980997800827,
+      "learning_rate": 6.126278954320294e-07,
+      "loss": 0.0,
+      "num_tokens": 11233619.0,
+      "reward": 0.6925251483917236,
+      "reward_std": 0.5938367247581482,
+      "rewards/cosine_scaled_reward/mean": -0.13029994070529938,
+      "rewards/cosine_scaled_reward/std": 0.37749138474464417,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1913.0,
+      "completions/mean_length": 1120.359375,
+      "completions/mean_terminated_length": 948.5740966796875,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "epoch": 0.11885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30854102969169617,
+      "learning_rate": 6.048412045323164e-07,
+      "loss": -0.0,
+      "num_tokens": 11315786.0,
+      "reward": 0.560060977935791,
+      "reward_std": 0.5216183662414551,
+      "rewards/cosine_scaled_reward/mean": -0.1418444812297821,
+      "rewards/cosine_scaled_reward/std": 0.33836889266967773,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1897.0,
+      "completions/mean_length": 1158.421875,
+      "completions/mean_terminated_length": 953.1346435546875,
+      "completions/min_length": 503.0,
+      "completions/min_terminated_length": 503.0,
+      "epoch": 0.12,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29531243443489075,
+      "learning_rate": 5.97037808470444e-07,
+      "loss": -0.0,
+      "num_tokens": 11401213.0,
+      "reward": 1.0410652160644531,
+      "reward_std": 0.7858219742774963,
+      "rewards/cosine_scaled_reward/mean": 0.09084508568048477,
+      "rewards/cosine_scaled_reward/std": 0.5061684250831604,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1867.0,
+      "completions/mean_length": 1045.859375,
+      "completions/mean_terminated_length": 837.867919921875,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "epoch": 0.12114285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26259294152259827,
+      "learning_rate": 5.892200842364462e-07,
+      "loss": -0.0,
+      "num_tokens": 11478980.0,
+      "reward": 1.0545225143432617,
+      "reward_std": 0.7633667588233948,
+      "rewards/cosine_scaled_reward/mean": 0.07413630187511444,
+      "rewards/cosine_scaled_reward/std": 0.48842984437942505,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1999.0,
+      "completions/mean_length": 1101.234375,
+      "completions/mean_terminated_length": 946.30908203125,
+      "completions/min_length": 346.0,
+      "completions/min_terminated_length": 346.0,
+      "epoch": 0.12228571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3363504409790039,
+      "learning_rate": 5.813904131848564e-07,
+      "loss": 0.0,
+      "num_tokens": 11560611.0,
+      "reward": 0.648673415184021,
+      "reward_std": 0.6051540970802307,
+      "rewards/cosine_scaled_reward/mean": -0.11316327750682831,
+      "rewards/cosine_scaled_reward/std": 0.37149766087532043,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1857.0,
+      "completions/mean_length": 1225.28125,
+      "completions/mean_terminated_length": 1054.5283203125,
+      "completions/min_length": 515.0,
+      "completions/min_terminated_length": 515.0,
+      "epoch": 0.12342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2867675721645355,
+      "learning_rate": 5.735511803093248e-07,
+      "loss": 0.0,
+      "num_tokens": 11649389.0,
+      "reward": 0.560509204864502,
+      "reward_std": 0.6691359877586365,
+      "rewards/cosine_scaled_reward/mean": -0.14943289756774902,
+      "rewards/cosine_scaled_reward/std": 0.4461749494075775,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2001.0,
+      "completions/mean_length": 1227.203125,
+      "completions/mean_terminated_length": 1056.84912109375,
+      "completions/min_length": 513.0,
+      "completions/min_terminated_length": 513.0,
+      "epoch": 0.12457142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2772690951824188,
+      "learning_rate": 5.657047735161255e-07,
+      "loss": -0.0,
+      "num_tokens": 11739178.0,
+      "reward": 0.6980891227722168,
+      "reward_std": 0.624833345413208,
+      "rewards/cosine_scaled_reward/mean": -0.0650179386138916,
+      "rewards/cosine_scaled_reward/std": 0.41062912344932556,
+      "rewards/format_reward/mean": 0.828125,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1727.0,
+      "completions/mean_length": 1145.0,
+      "completions/mean_terminated_length": 914.8235473632812,
+      "completions/min_length": 414.0,
+      "completions/min_terminated_length": 414.0,
+      "epoch": 0.12571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3468596637248993,
+      "learning_rate": 5.578535828967777e-07,
+      "loss": -0.0,
+      "num_tokens": 11823234.0,
+      "reward": 0.6972323656082153,
+      "reward_std": 0.5477026104927063,
+      "rewards/cosine_scaled_reward/mean": -0.08888379484415054,
+      "rewards/cosine_scaled_reward/std": 0.3565239906311035,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1969.0,
+      "completions/max_terminated_length": 1969.0,
+      "completions/mean_length": 977.046875,
+      "completions/mean_terminated_length": 977.046875,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "epoch": 0.12685714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3180137574672699,
+      "learning_rate": 5.5e-07,
+      "loss": 0.0,
+      "num_tokens": 11895885.0,
+      "reward": 0.8744360208511353,
+      "reward_std": 0.5815237164497375,
+      "rewards/cosine_scaled_reward/mean": -0.06278196722269058,
+      "rewards/cosine_scaled_reward/std": 0.37791064381599426,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1954.0,
+      "completions/mean_length": 1269.421875,
+      "completions/mean_terminated_length": 1089.75,
+      "completions/min_length": 605.0,
+      "completions/min_terminated_length": 605.0,
+      "epoch": 0.128,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2817465364933014,
+      "learning_rate": 5.421464171032224e-07,
+      "loss": -0.0,
+      "num_tokens": 11988224.0,
+      "reward": 0.9151681065559387,
+      "reward_std": 0.594943642616272,
+      "rewards/cosine_scaled_reward/mean": 0.02789657562971115,
+      "rewards/cosine_scaled_reward/std": 0.4965399205684662,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1910.0,
+      "completions/max_terminated_length": 1910.0,
+      "completions/mean_length": 934.578125,
+      "completions/mean_terminated_length": 934.578125,
+      "completions/min_length": 326.0,
+      "completions/min_terminated_length": 326.0,
+      "epoch": 0.12914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3341560959815979,
+      "learning_rate": 5.342952264838747e-07,
+      "loss": -0.0,
+      "num_tokens": 12058333.0,
+      "reward": 1.0256879329681396,
+      "reward_std": 0.717230498790741,
+      "rewards/cosine_scaled_reward/mean": 0.02065650373697281,
+      "rewards/cosine_scaled_reward/std": 0.4963410794734955,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1911.0,
+      "completions/mean_length": 1055.21875,
+      "completions/mean_terminated_length": 971.0847778320312,
+      "completions/min_length": 124.0,
+      "completions/min_terminated_length": 124.0,
+      "epoch": 0.13028571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3800676763057709,
+      "learning_rate": 5.264488196906752e-07,
+      "loss": -0.0,
+      "num_tokens": 12135715.0,
+      "reward": 0.649993896484375,
+      "reward_std": 0.5865596532821655,
+      "rewards/cosine_scaled_reward/mean": -0.1750030517578125,
+      "rewards/cosine_scaled_reward/std": 0.3388007879257202,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1981.0,
+      "completions/mean_length": 1169.671875,
+      "completions/mean_terminated_length": 987.3773803710938,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "epoch": 0.13142857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3112519085407257,
+      "learning_rate": 5.186095868151436e-07,
+      "loss": 0.0,
+      "num_tokens": 12221790.0,
+      "reward": 0.7184536457061768,
+      "reward_std": 0.44992831349372864,
+      "rewards/cosine_scaled_reward/mean": -0.06264819949865341,
+      "rewards/cosine_scaled_reward/std": 0.44565486907958984,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1936.0,
+      "completions/mean_length": 1224.890625,
+      "completions/mean_terminated_length": 1072.4630126953125,
+      "completions/min_length": 393.0,
+      "completions/min_terminated_length": 393.0,
+      "epoch": 0.13257142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2884223461151123,
+      "learning_rate": 5.107799157635538e-07,
+      "loss": 0.0,
+      "num_tokens": 12311567.0,
+      "reward": 0.8372049927711487,
+      "reward_std": 0.608986496925354,
+      "rewards/cosine_scaled_reward/mean": -0.026710007339715958,
+      "rewards/cosine_scaled_reward/std": 0.4437602162361145,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1927.0,
+      "completions/mean_length": 1078.65625,
+      "completions/mean_terminated_length": 1030.9835205078125,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "epoch": 0.1337142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3016076385974884,
+      "learning_rate": 5.02962191529556e-07,
+      "loss": -0.0,
+      "num_tokens": 12391625.0,
+      "reward": 0.8182538747787476,
+      "reward_std": 0.6463132500648499,
+      "rewards/cosine_scaled_reward/mean": -0.09087307006120682,
+      "rewards/cosine_scaled_reward/std": 0.3895137310028076,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1946.0,
+      "completions/mean_length": 1226.046875,
+      "completions/mean_terminated_length": 952.0625,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "epoch": 0.13485714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2991194427013397,
+      "learning_rate": 4.951587954676837e-07,
+      "loss": 0.0,
+      "num_tokens": 12480628.0,
+      "reward": 0.6370267868041992,
+      "reward_std": 0.7525250911712646,
+      "rewards/cosine_scaled_reward/mean": -0.056486621499061584,
+      "rewards/cosine_scaled_reward/std": 0.44576171040534973,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1038.96875,
+      "completions/mean_terminated_length": 894.8214721679688,
+      "completions/min_length": 149.0,
+      "completions/min_terminated_length": 149.0,
+      "epoch": 0.136,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.4483291506767273,
+      "learning_rate": 4.873721045679706e-07,
+      "loss": 0.0,
+      "num_tokens": 12557530.0,
+      "reward": 0.9855979084968567,
+      "reward_std": 0.6055079698562622,
+      "rewards/cosine_scaled_reward/mean": 0.04748644679784775,
+      "rewards/cosine_scaled_reward/std": 0.47108832001686096,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1947.0,
+      "completions/mean_length": 856.578125,
+      "completions/mean_terminated_length": 818.1451416015625,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 0.13714285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3406151831150055,
+      "learning_rate": 4.79604490731896e-07,
+      "loss": -0.0,
+      "num_tokens": 12622807.0,
+      "reward": 0.7979192733764648,
+      "reward_std": 0.6180044412612915,
+      "rewards/cosine_scaled_reward/mean": -0.10104038566350937,
+      "rewards/cosine_scaled_reward/std": 0.44317325949668884,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1764.0,
+      "completions/mean_length": 726.34375,
+      "completions/mean_terminated_length": 683.7096557617188,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 0.1382857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.4178949296474457,
+      "learning_rate": 4.7185832004988133e-07,
+      "loss": 0.0,
+      "num_tokens": 12678989.0,
+      "reward": 1.161607265472412,
+      "reward_std": 0.6393733024597168,
+      "rewards/cosine_scaled_reward/mean": 0.08080361783504486,
+      "rewards/cosine_scaled_reward/std": 0.5313310027122498,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2003.0,
+      "completions/mean_length": 1133.796875,
+      "completions/mean_terminated_length": 1039.22412109375,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 0.13942857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3333284258842468,
+      "learning_rate": 4.641359520805548e-07,
+      "loss": 0.0,
+      "num_tokens": 12763112.0,
+      "reward": 0.9356573820114136,
+      "reward_std": 0.6247758269309998,
+      "rewards/cosine_scaled_reward/mean": -0.02435879409313202,
+      "rewards/cosine_scaled_reward/std": 0.4759780466556549,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1993.0,
+      "completions/mean_length": 1031.296875,
+      "completions/mean_terminated_length": 981.2950439453125,
+      "completions/min_length": 447.0,
+      "completions/min_terminated_length": 447.0,
+      "epoch": 0.14057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29939791560173035,
+      "learning_rate": 4.5643973913200837e-07,
+      "loss": -0.0,
+      "num_tokens": 12839347.0,
+      "reward": 0.7725162506103516,
+      "reward_std": 0.5560778379440308,
+      "rewards/cosine_scaled_reward/mean": -0.09811685979366302,
+      "rewards/cosine_scaled_reward/std": 0.3822804391384125,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2011.0,
+      "completions/mean_length": 979.234375,
+      "completions/mean_terminated_length": 944.758056640625,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "epoch": 0.1417142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34992095828056335,
+      "learning_rate": 4.4877202554526084e-07,
+      "loss": 0.0,
+      "num_tokens": 12912970.0,
+      "reward": 1.085427165031433,
+      "reward_std": 0.6837464570999146,
+      "rewards/cosine_scaled_reward/mean": 0.05052608996629715,
+      "rewards/cosine_scaled_reward/std": 0.4791998267173767,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1993.0,
+      "completions/mean_length": 1076.40625,
+      "completions/mean_terminated_length": 994.0678100585938,
+      "completions/min_length": 309.0,
+      "completions/min_terminated_length": 309.0,
+      "epoch": 0.14285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27060386538505554,
+      "learning_rate": 4.4113514698014953e-07,
+      "loss": -0.0,
+      "num_tokens": 12992788.0,
+      "reward": 1.0397578477859497,
+      "reward_std": 0.43823006749153137,
+      "rewards/cosine_scaled_reward/mean": 0.019878946244716644,
+      "rewards/cosine_scaled_reward/std": 0.46214956045150757,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1856.0,
+      "completions/mean_length": 1071.53125,
+      "completions/mean_terminated_length": 1006.4334106445312,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "epoch": 0.144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2776121497154236,
+      "learning_rate": 4.3353142970386557e-07,
+      "loss": 0.0,
+      "num_tokens": 13072662.0,
+      "reward": 1.0028693675994873,
+      "reward_std": 0.6879971027374268,
+      "rewards/cosine_scaled_reward/mean": 0.0014346465468406677,
+      "rewards/cosine_scaled_reward/std": 0.42488595843315125,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1868.0,
+      "completions/mean_length": 1180.484375,
+      "completions/mean_terminated_length": 1056.5535888671875,
+      "completions/min_length": 285.0,
+      "completions/min_terminated_length": 285.0,
+      "epoch": 0.14514285714285713,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2829054594039917,
+      "learning_rate": 4.2596318988235037e-07,
+      "loss": -0.0,
+      "num_tokens": 13159309.0,
+      "reward": 0.6576684713363647,
+      "reward_std": 0.66895592212677,
+      "rewards/cosine_scaled_reward/mean": -0.15554077923297882,
+      "rewards/cosine_scaled_reward/std": 0.3959099054336548,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1869.0,
+      "completions/mean_length": 1053.328125,
+      "completions/mean_terminated_length": 950.4310302734375,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "epoch": 0.1462857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29738253355026245,
+      "learning_rate": 4.1843273287476854e-07,
+      "loss": -0.0,
+      "num_tokens": 13237074.0,
+      "reward": 0.8851851224899292,
+      "reward_std": 0.7390589118003845,
+      "rewards/cosine_scaled_reward/mean": -0.041782446205616,
+      "rewards/cosine_scaled_reward/std": 0.46901625394821167,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1927.0,
+      "completions/mean_length": 1228.484375,
+      "completions/mean_terminated_length": 1111.4107666015625,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "epoch": 0.14742857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25943535566329956,
+      "learning_rate": 4.1094235253127374e-07,
+      "loss": -0.0,
+      "num_tokens": 13326401.0,
+      "reward": 0.9628820419311523,
+      "reward_std": 0.6490253210067749,
+      "rewards/cosine_scaled_reward/mean": 0.004878522828221321,
+      "rewards/cosine_scaled_reward/std": 0.45456331968307495,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1967.0,
+      "completions/mean_length": 1089.578125,
+      "completions/mean_terminated_length": 952.6607666015625,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "epoch": 0.14857142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3009719252586365,
+      "learning_rate": 4.034943304942796e-07,
+      "loss": 0.0,
+      "num_tokens": 13406638.0,
+      "reward": 0.5984547138214111,
+      "reward_std": 0.7008002996444702,
+      "rewards/cosine_scaled_reward/mean": -0.14608514308929443,
+      "rewards/cosine_scaled_reward/std": 0.37894922494888306,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1651.0,
+      "completions/mean_length": 1058.03125,
+      "completions/mean_terminated_length": 916.607177734375,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "epoch": 0.14971428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.306725412607193,
+      "learning_rate": 3.9609093550344907e-07,
+      "loss": 0.0,
+      "num_tokens": 13484088.0,
+      "reward": 1.0469268560409546,
+      "reward_std": 0.6023457050323486,
+      "rewards/cosine_scaled_reward/mean": 0.0703384131193161,
+      "rewards/cosine_scaled_reward/std": 0.47298464179039,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1773.0,
+      "completions/mean_length": 1342.78125,
+      "completions/mean_terminated_length": 919.6500244140625,
+      "completions/min_length": 366.0,
+      "completions/min_terminated_length": 366.0,
+      "epoch": 0.15085714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3032574951648712,
+      "learning_rate": 3.8873442270461485e-07,
+      "loss": -0.0,
+      "num_tokens": 13581090.0,
+      "reward": 0.4643245339393616,
+      "reward_std": 0.7533800601959229,
+      "rewards/cosine_scaled_reward/mean": -0.06471271812915802,
+      "rewards/cosine_scaled_reward/std": 0.4610835611820221,
+      "rewards/format_reward/mean": 0.59375,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1974.0,
+      "completions/mean_length": 1144.921875,
+      "completions/mean_terminated_length": 957.4906005859375,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "epoch": 0.152,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32285141944885254,
+      "learning_rate": 3.8142703296283953e-07,
+      "loss": 0.0,
+      "num_tokens": 13665589.0,
+      "reward": 0.5014957189559937,
+      "reward_std": 0.5352932214736938,
+      "rewards/cosine_scaled_reward/mean": -0.17112717032432556,
+      "rewards/cosine_scaled_reward/std": 0.28127768635749817,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1965.0,
+      "completions/mean_length": 975.53125,
+      "completions/mean_terminated_length": 958.5079956054688,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 0.15314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.40716752409935,
+      "learning_rate": 3.7417099217982686e-07,
+      "loss": -0.0,
+      "num_tokens": 13738591.0,
+      "reward": 1.1759617328643799,
+      "reward_std": 0.4804629683494568,
+      "rewards/cosine_scaled_reward/mean": 0.08798093348741531,
+      "rewards/cosine_scaled_reward/std": 0.5343761444091797,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1686.0,
+      "completions/max_terminated_length": 1686.0,
+      "completions/mean_length": 758.515625,
+      "completions/mean_terminated_length": 758.515625,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.15428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.42696353793144226,
+      "learning_rate": 3.6696851061588994e-07,
+      "loss": -0.0,
+      "num_tokens": 13797608.0,
+      "reward": 1.3851683139801025,
+      "reward_std": 0.5234883427619934,
+      "rewards/cosine_scaled_reward/mean": 0.19258417189121246,
+      "rewards/cosine_scaled_reward/std": 0.49346473813056946,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2007.0,
+      "completions/mean_length": 1169.875,
+      "completions/mean_terminated_length": 1095.4576416015625,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "epoch": 0.15542857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28027620911598206,
+      "learning_rate": 3.5982178221668533e-07,
+      "loss": -0.0,
+      "num_tokens": 13883152.0,
+      "reward": 1.0174503326416016,
+      "reward_std": 0.5889347791671753,
+      "rewards/cosine_scaled_reward/mean": 0.016537662595510483,
+      "rewards/cosine_scaled_reward/std": 0.4763922095298767,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1814.0,
+      "completions/mean_length": 1105.3125,
+      "completions/mean_terminated_length": 1042.4666748046875,
+      "completions/min_length": 446.0,
+      "completions/min_terminated_length": 446.0,
+      "epoch": 0.15657142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3002299666404724,
+      "learning_rate": 3.5273298394491515e-07,
+      "loss": 0.0,
+      "num_tokens": 13964500.0,
+      "reward": 0.841381847858429,
+      "reward_std": 0.6354345083236694,
+      "rewards/cosine_scaled_reward/mean": -0.07149658352136612,
+      "rewards/cosine_scaled_reward/std": 0.4138363003730774,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1983.0,
+      "completions/mean_length": 1125.484375,
+      "completions/mean_terminated_length": 974.5272216796875,
+      "completions/min_length": 361.0,
+      "completions/min_terminated_length": 361.0,
+      "epoch": 0.15771428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28766506910324097,
+      "learning_rate": 3.45704275117204e-07,
+      "loss": -0.0,
+      "num_tokens": 14047843.0,
+      "reward": 0.8758631944656372,
+      "reward_std": 0.7212573289871216,
+      "rewards/cosine_scaled_reward/mean": -0.05425591766834259,
+      "rewards/cosine_scaled_reward/std": 0.4783853590488434,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2023.0,
+      "completions/mean_length": 1216.171875,
+      "completions/mean_terminated_length": 1160.7166748046875,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "epoch": 0.15885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2882857024669647,
+      "learning_rate": 3.387377967463493e-07,
+      "loss": -0.0,
+      "num_tokens": 14136318.0,
+      "reward": 0.7189284563064575,
+      "reward_std": 0.4593912959098816,
+      "rewards/cosine_scaled_reward/mean": -0.13272328674793243,
+      "rewards/cosine_scaled_reward/std": 0.33584704995155334,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2017.0,
+      "completions/mean_length": 1142.140625,
+      "completions/mean_terminated_length": 1012.732177734375,
+      "completions/min_length": 389.0,
+      "completions/min_terminated_length": 389.0,
+      "epoch": 0.16,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.3000667095184326,
+      "learning_rate": 3.3183567088914833e-07,
+      "loss": 0.0,
+      "num_tokens": 14219639.0,
+      "reward": 0.8278639316558838,
+      "reward_std": 0.46724599599838257,
+      "rewards/cosine_scaled_reward/mean": -0.03919300064444542,
+      "rewards/cosine_scaled_reward/std": 0.4650508463382721,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1919.0,
+      "completions/mean_length": 1025.421875,
+      "completions/mean_terminated_length": 975.131103515625,
+      "completions/min_length": 394.0,
+      "completions/min_terminated_length": 394.0,
+      "epoch": 0.16114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3207882046699524,
+      "learning_rate": 3.250000000000001e-07,
+      "loss": 0.0,
+      "num_tokens": 14295826.0,
+      "reward": 0.8871637582778931,
+      "reward_std": 0.6538586616516113,
+      "rewards/cosine_scaled_reward/mean": -0.04079316183924675,
+      "rewards/cosine_scaled_reward/std": 0.43451616168022156,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1984.0,
+      "completions/mean_length": 1233.90625,
+      "completions/mean_terminated_length": 1149.689697265625,
+      "completions/min_length": 278.0,
+      "completions/min_terminated_length": 278.0,
+      "epoch": 0.16228571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3009903132915497,
+      "learning_rate": 3.182328662904756e-07,
+      "loss": 0.0,
+      "num_tokens": 14385300.0,
+      "reward": 0.8573208451271057,
+      "reward_std": 0.6099269390106201,
+      "rewards/cosine_scaled_reward/mean": -0.055714573711156845,
+      "rewards/cosine_scaled_reward/std": 0.43728360533714294,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1946.0,
+      "completions/mean_length": 1136.078125,
+      "completions/mean_terminated_length": 1005.8035888671875,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "epoch": 0.16342857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31794917583465576,
+      "learning_rate": 3.115363310950578e-07,
+      "loss": 0.0,
+      "num_tokens": 14468825.0,
+      "reward": 0.6553314924240112,
+      "reward_std": 0.6344339847564697,
+      "rewards/cosine_scaled_reward/mean": -0.11764675378799438,
+      "rewards/cosine_scaled_reward/std": 0.3099633455276489,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1947.0,
+      "completions/mean_length": 1220.6875,
+      "completions/mean_terminated_length": 1029.769287109375,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 0.16457142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3814108967781067,
+      "learning_rate": 3.0491243424323783e-07,
+      "loss": 0.0,
+      "num_tokens": 14558437.0,
+      "reward": 0.7285318970680237,
+      "reward_std": 0.8925961256027222,
+      "rewards/cosine_scaled_reward/mean": -0.05760904401540756,
+      "rewards/cosine_scaled_reward/std": 0.492266446352005,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1880.0,
+      "completions/mean_length": 969.796875,
+      "completions/mean_terminated_length": 916.7704467773438,
+      "completions/min_length": 275.0,
+      "completions/min_terminated_length": 275.0,
+      "epoch": 0.1657142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3201180398464203,
+      "learning_rate": 2.9836319343816397e-07,
+      "loss": -0.0,
+      "num_tokens": 14630448.0,
+      "reward": 0.8149441480636597,
+      "reward_std": 0.5824600458145142,
+      "rewards/cosine_scaled_reward/mean": -0.08471541851758957,
+      "rewards/cosine_scaled_reward/std": 0.475755512714386,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1943.0,
+      "completions/mean_length": 1034.484375,
+      "completions/mean_terminated_length": 966.9166870117188,
+      "completions/min_length": 482.0,
+      "completions/min_terminated_length": 482.0,
+      "epoch": 0.16685714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28184273838996887,
+      "learning_rate": 2.918906036420294e-07,
+      "loss": -0.0,
+      "num_tokens": 14707271.0,
+      "reward": 0.8387603759765625,
+      "reward_std": 0.5346506237983704,
+      "rewards/cosine_scaled_reward/mean": -0.07280732691287994,
+      "rewards/cosine_scaled_reward/std": 0.43024110794067383,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1980.0,
+      "completions/mean_length": 1249.984375,
+      "completions/mean_terminated_length": 1046.568603515625,
+      "completions/min_length": 550.0,
+      "completions/min_terminated_length": 550.0,
+      "epoch": 0.168,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32145801186561584,
+      "learning_rate": 2.854966364683872e-07,
+      "loss": 0.0,
+      "num_tokens": 14798054.0,
+      "reward": 0.7505484819412231,
+      "reward_std": 0.5473448634147644,
+      "rewards/cosine_scaled_reward/mean": -0.07003828883171082,
+      "rewards/cosine_scaled_reward/std": 0.4046306014060974,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1844.0,
+      "completions/mean_length": 1062.828125,
+      "completions/mean_terminated_length": 960.913818359375,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "epoch": 0.16914285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2667451500892639,
+      "learning_rate": 2.791832395815782e-07,
+      "loss": -0.0,
+      "num_tokens": 14877259.0,
+      "reward": 0.7823130488395691,
+      "reward_std": 0.48230016231536865,
+      "rewards/cosine_scaled_reward/mean": -0.06978099048137665,
+      "rewards/cosine_scaled_reward/std": 0.37567150592803955,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1386.875,
+      "completions/mean_terminated_length": 1086.3636474609375,
+      "completions/min_length": 439.0,
+      "completions/min_terminated_length": 439.0,
+      "epoch": 0.1702857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2730913758277893,
+      "learning_rate": 2.729523361034538e-07,
+      "loss": 0.0,
+      "num_tokens": 14977915.0,
+      "reward": 0.48214927315711975,
+      "reward_std": 0.8376681804656982,
+      "rewards/cosine_scaled_reward/mean": -0.14173786342144012,
+      "rewards/cosine_scaled_reward/std": 0.4272434711456299,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1831.0,
+      "completions/mean_length": 994.15625,
+      "completions/mean_terminated_length": 942.3278198242188,
+      "completions/min_length": 322.0,
+      "completions/min_terminated_length": 322.0,
+      "epoch": 0.17142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2946690022945404,
+      "learning_rate": 2.6680582402757324e-07,
+      "loss": -0.0,
+      "num_tokens": 15052045.0,
+      "reward": 0.8893749713897705,
+      "reward_std": 0.7130615711212158,
+      "rewards/cosine_scaled_reward/mean": -0.05531252920627594,
+      "rewards/cosine_scaled_reward/std": 0.4389563202857971,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 150
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 200,
+  "num_input_tokens_seen": 15052045,
+  "num_train_epochs": 1,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-150/training_args.bin b/checkpoint-150/training_args.bin
new file mode 100644
index 0000000..9e03ee7
--- /dev/null
+++ b/checkpoint-150/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec4fbe7e29ae78abab9c9fd5555870c1bffe7656ceef4ac5fa2148a15b61b1e3
+size 8888
diff --git a/checkpoint-150/zero_to_fp32.py b/checkpoint-150/zero_to_fp32.py
new file mode 100644
index 0000000..0e75914
--- /dev/null
+++ b/checkpoint-150/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# Diagnostic switch: when truthy, the parsing/merging helpers in this script print extra details.
+debug = 0
+
+# load to cpu (this device is passed as ``map_location`` to ``torch.load``)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    The weights are written to ``output_dir`` one shard at a time; when the result is sharded an
+    index json file mapping each weight to its shard file is written next to the shards.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` writes everything into one file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check
+    # (fail early, before any checkpoint data is read, if an optional package is missing)
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    # lazy_mode=True: the entries are not converted to torch tensors yet, that happens per shard below
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # an memory-efficient approach for sharding
+        # (the shard layout is planned on uninitialised tensors of matching shape/dtype;
+        #  presumably the splitter only needs sizes - TODO confirm against huggingface_hub)
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # no sharding requested: build a minimal stand-in exposing the two attributes used below
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        # only the entries of the current shard are converted to real tensors
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        # (entries are dropped from both dicts before moving on to the next shard)
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    # (`metadata` / `tensor_to_filename` are only read in the sharded case, the namedtuple
+    #  stand-in above does not provide them)
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-200/config.json b/checkpoint-200/config.json
new file mode 100644
index 0000000..78fed5b
--- /dev/null
+++ b/checkpoint-200/config.json
@@ -0,0 +1,29 @@
+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": false,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
diff --git a/checkpoint-200/generation_config.json b/checkpoint-200/generation_config.json
new file mode 100644
index 0000000..92878bd
--- /dev/null
+++ b/checkpoint-200/generation_config.json
@@ -0,0 +1,9 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 151646,
+  "do_sample": true,
+  "eos_token_id": 151643,
+  "temperature": 0.6,
+  "top_p": 0.95,
+  "transformers_version": "4.51.3"
+}
diff --git a/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..417b7cd
--- /dev/null
+++ b/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d31f8d912d8556b877b73713942a1ef3d16f102e8e4af8bc8ca1b2cd30ee2e32
+size 5331274140
diff --git a/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..2f9cc40
--- /dev/null
+++ b/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92dffb8fd9dea596337dd9e4424b7befb9d707ee8ec9e2114e09d35a9c62619d
+size 5331276572
diff --git a/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..5454dd2
--- /dev/null
+++ b/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5be5a1e6f75a65d11bcbfdaba184b16fc04b2e29d0eb18df1d50df736d8c195f
+size 5331276892
diff --git a/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..ea9dfd5
--- /dev/null
+++ b/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a9b594b87c6bd7cee5405dfdf7461ad4ddfd8bece628fc8757a62d5ac4149c7
+size 5331273884
diff --git a/checkpoint-200/global_step200/mp_rank_00_model_states.pt b/checkpoint-200/global_step200/mp_rank_00_model_states.pt
new file mode 100644
index 0000000..7c44f9f
--- /dev/null
+++ b/checkpoint-200/global_step200/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a0586a5a489ebb43c544e6b0f40d6faf0fcfc162cfaad7b8d98f2f24c647cef
+size 3554267640
diff --git a/checkpoint-200/latest b/checkpoint-200/latest
new file mode 100644
index 0000000..753e24e
--- /dev/null
+++ b/checkpoint-200/latest
@@ -0,0 +1 @@
+global_step200
\ No newline at end of file
diff --git a/checkpoint-200/model.safetensors b/checkpoint-200/model.safetensors
new file mode 100644
index 0000000..24c1613
--- /dev/null
+++ b/checkpoint-200/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:721e7cd7a52fbe85031e588ef9dd53b84820dc30295efc7a202ec5bf16e6a44d
+size 3554214752
diff --git a/checkpoint-200/rng_state_0.pth b/checkpoint-200/rng_state_0.pth
new file mode 100644
index 0000000..86f2a23
--- /dev/null
+++ b/checkpoint-200/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d27bca98fc43661d89f342b159db8ba39985151ad393ad050976214ea15c356
+size 14960
diff --git a/checkpoint-200/rng_state_1.pth b/checkpoint-200/rng_state_1.pth
new file mode 100644
index 0000000..cb38291
--- /dev/null
+++ b/checkpoint-200/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:045043b7647cb23fd0c5f157aeab975633f4988068c700133c9e818bd7d23acc
+size 14960
diff --git a/checkpoint-200/rng_state_2.pth b/checkpoint-200/rng_state_2.pth
new file mode 100644
index 0000000..ef24bc9
--- /dev/null
+++ b/checkpoint-200/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75b318b1607e2d2d058b7cae62ad715b10b7734cab165019ee7faeb90fa8f9cb
+size 14960
diff --git a/checkpoint-200/rng_state_3.pth b/checkpoint-200/rng_state_3.pth
new file mode 100644
index 0000000..2df3ea6
--- /dev/null
+++ b/checkpoint-200/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0bd846d0d4a459d36cd4ee58b207443fa2c1b79a1c4a8df8fedfff7f31c370c
+size 14960
diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt
new file mode 100644
index 0000000..fdb5a84
--- /dev/null
+++ b/checkpoint-200/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:729ccc5c1effddf89d086a25cf24ed6a75e431dc3254d66b666c6a9c32393455
+size 1064
diff --git a/checkpoint-200/special_tokens_map.json b/checkpoint-200/special_tokens_map.json
new file mode 100644
index 0000000..1d385d6
--- /dev/null
+++ b/checkpoint-200/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/checkpoint-200/tokenizer.json b/checkpoint-200/tokenizer.json
new file mode 100644
index 0000000..e7cd2c1
--- /dev/null
+++ b/checkpoint-200/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a
+size 11422959
diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json
new file mode 100644
index 0000000..ef6e98c
--- /dev/null
+++ b/checkpoint-200/tokenizer_config.json
@@ -0,0 +1,195 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool 
%}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\\n'}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json
new file mode 100644
index 0000000..dd9a905
--- /dev/null
+++ b/checkpoint-200/trainer_state.json
@@ -0,0 +1,5434 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.22857142857142856,
+  "eval_steps": 500,
+  "global_step": 200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1734.0,
+      "completions/mean_length": 1702.03125,
+      "completions/mean_terminated_length": 993.6190795898438,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "epoch": 0.001142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2544386684894562,
+      "learning_rate": 0.0,
+      "loss": -0.0,
+      "num_tokens": 118418.0,
+      "reward": 0.17899775505065918,
+      "reward_std": 0.7650213241577148,
+      "rewards/cosine_scaled_reward/mean": -0.09800112992525101,
+      "rewards/cosine_scaled_reward/std": 0.37953105568885803,
+      "rewards/format_reward/mean": 0.375,
+      "rewards/format_reward/std": 0.48795005679130554,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1894.0,
+      "completions/mean_length": 1738.90625,
+      "completions/mean_terminated_length": 949.0,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "epoch": 0.002285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2436082512140274,
+      "learning_rate": 5e-08,
+      "loss": -0.0,
+      "num_tokens": 239748.0,
+      "reward": 0.3848632574081421,
+      "reward_std": 0.9111153483390808,
+      "rewards/cosine_scaled_reward/mean": 0.020556632429361343,
+      "rewards/cosine_scaled_reward/std": 0.4492928683757782,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1545.0,
+      "completions/mean_length": 1989.015625,
+      "completions/mean_terminated_length": 1104.25,
+      "completions/min_length": 706.0,
+      "completions/min_terminated_length": 706.0,
+      "epoch": 0.0034285714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2544717788696289,
+      "learning_rate": 1e-07,
+      "loss": -0.0,
+      "num_tokens": 377517.0,
+      "reward": -0.3279358148574829,
+      "reward_std": 0.33216947317123413,
+      "rewards/cosine_scaled_reward/mean": -0.20303040742874146,
+      "rewards/cosine_scaled_reward/std": 0.179075226187706,
+      "rewards/format_reward/mean": 0.078125,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2033.0,
+      "completions/mean_length": 1566.421875,
+      "completions/mean_terminated_length": 1084.84375,
+      "completions/min_length": 502.0,
+      "completions/min_terminated_length": 502.0,
+      "epoch": 0.004571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28807103633880615,
+      "learning_rate": 1.5e-07,
+      "loss": -0.0,
+      "num_tokens": 487576.0,
+      "reward": 0.2716121971607208,
+      "reward_std": 0.6643469333648682,
+      "rewards/cosine_scaled_reward/mean": -0.12981891632080078,
+      "rewards/cosine_scaled_reward/std": 0.3019586503505707,
+      "rewards/format_reward/mean": 0.53125,
+      "rewards/format_reward/std": 0.5029674172401428,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1807.0,
+      "completions/mean_length": 1936.84375,
+      "completions/mean_terminated_length": 1031.71435546875,
+      "completions/min_length": 463.0,
+      "completions/min_terminated_length": 463.0,
+      "epoch": 0.005714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26783761382102966,
+      "learning_rate": 2e-07,
+      "loss": -0.0,
+      "num_tokens": 622350.0,
+      "reward": -0.3612896800041199,
+      "reward_std": 0.41048353910446167,
+      "rewards/cosine_scaled_reward/mean": -0.23533234000205994,
+      "rewards/cosine_scaled_reward/std": 0.20467400550842285,
+      "rewards/format_reward/mean": 0.109375,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1301.0,
+      "completions/mean_length": 1889.453125,
+      "completions/mean_terminated_length": 779.625,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "epoch": 0.006857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.262518972158432,
+      "learning_rate": 2.5e-07,
+      "loss": 0.0,
+      "num_tokens": 754923.0,
+      "reward": -0.29250282049179077,
+      "reward_std": 0.5422531962394714,
+      "rewards/cosine_scaled_reward/mean": -0.22437641024589539,
+      "rewards/cosine_scaled_reward/std": 0.22509199380874634,
+      "rewards/format_reward/mean": 0.15625,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1568.0,
+      "completions/mean_length": 1921.921875,
+      "completions/mean_terminated_length": 1314.45458984375,
+      "completions/min_length": 927.0,
+      "completions/min_terminated_length": 927.0,
+      "epoch": 0.008,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22601397335529327,
+      "learning_rate": 3e-07,
+      "loss": 0.0,
+      "num_tokens": 888334.0,
+      "reward": 0.025340259075164795,
+      "reward_std": 0.7285393476486206,
+      "rewards/cosine_scaled_reward/mean": -0.1279548704624176,
+      "rewards/cosine_scaled_reward/std": 0.40222346782684326,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2009.0,
+      "completions/mean_length": 1736.859375,
+      "completions/mean_terminated_length": 999.9473876953125,
+      "completions/min_length": 305.0,
+      "completions/min_terminated_length": 305.0,
+      "epoch": 0.009142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24552854895591736,
+      "learning_rate": 3.5e-07,
+      "loss": 0.0,
+      "num_tokens": 1009909.0,
+      "reward": 0.21729671955108643,
+      "reward_std": 0.6989120244979858,
+      "rewards/cosine_scaled_reward/mean": -0.055414143949747086,
+      "rewards/cosine_scaled_reward/std": 0.47493892908096313,
+      "rewards/format_reward/mean": 0.328125,
+      "rewards/format_reward/std": 0.4732423722743988,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1963.0,
+      "completions/mean_length": 1967.53125,
+      "completions/mean_terminated_length": 1475.77783203125,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "epoch": 0.010285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2430322915315628,
+      "learning_rate": 4e-07,
+      "loss": 0.0,
+      "num_tokens": 1147287.0,
+      "reward": -0.21451422572135925,
+      "reward_std": 0.587526798248291,
+      "rewards/cosine_scaled_reward/mean": -0.19319462776184082,
+      "rewards/cosine_scaled_reward/std": 0.29357606172561646,
+      "rewards/format_reward/mean": 0.171875,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1966.0,
+      "completions/mean_length": 1708.546875,
+      "completions/mean_terminated_length": 961.75,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "epoch": 0.011428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2543582320213318,
+      "learning_rate": 4.5e-07,
+      "loss": 0.0,
+      "num_tokens": 1267466.0,
+      "reward": 0.02539752423763275,
+      "reward_std": 0.545810341835022,
+      "rewards/cosine_scaled_reward/mean": -0.14355123043060303,
+      "rewards/cosine_scaled_reward/std": 0.36147356033325195,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.90625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1579.0,
+      "completions/mean_length": 1967.734375,
+      "completions/mean_terminated_length": 1191.8333740234375,
+      "completions/min_length": 843.0,
+      "completions/min_terminated_length": 843.0,
+      "epoch": 0.012571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24583907425403595,
+      "learning_rate": 5e-07,
+      "loss": -0.0,
+      "num_tokens": 1405073.0,
+      "reward": -0.46971434354782104,
+      "reward_std": 0.36104393005371094,
+      "rewards/cosine_scaled_reward/mean": -0.28173214197158813,
+      "rewards/cosine_scaled_reward/std": 0.17775526642799377,
+      "rewards/format_reward/mean": 0.09375,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1862.0,
+      "completions/mean_length": 1707.5625,
+      "completions/mean_terminated_length": 1176.47998046875,
+      "completions/min_length": 330.0,
+      "completions/min_terminated_length": 330.0,
+      "epoch": 0.013714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3135142922401428,
+      "learning_rate": 5.5e-07,
+      "loss": -0.0,
+      "num_tokens": 1525301.0,
+      "reward": 0.0018395520746707916,
+      "reward_std": 0.7012988328933716,
+      "rewards/cosine_scaled_reward/mean": -0.21783021092414856,
+      "rewards/cosine_scaled_reward/std": 0.324150949716568,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1745.0,
+      "completions/mean_length": 1841.96875,
+      "completions/mean_terminated_length": 1168.933349609375,
+      "completions/min_length": 442.0,
+      "completions/min_terminated_length": 442.0,
+      "epoch": 0.014857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2532394826412201,
+      "learning_rate": 6e-07,
+      "loss": -0.0,
+      "num_tokens": 1654227.0,
+      "reward": -0.10322706401348114,
+      "reward_std": 0.6915165185928345,
+      "rewards/cosine_scaled_reward/mean": -0.17661353945732117,
+      "rewards/cosine_scaled_reward/std": 0.329875111579895,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2017.0,
+      "completions/mean_length": 1816.390625,
+      "completions/mean_terminated_length": 1306.8499755859375,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "epoch": 0.016,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28405147790908813,
+      "learning_rate": 6.5e-07,
+      "loss": 0.0,
+      "num_tokens": 1781084.0,
+      "reward": 0.10602855682373047,
+      "reward_std": 0.630502462387085,
+      "rewards/cosine_scaled_reward/mean": -0.11104822158813477,
+      "rewards/cosine_scaled_reward/std": 0.3846627473831177,
+      "rewards/format_reward/mean": 0.328125,
+      "rewards/format_reward/std": 0.4732423722743988,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1574.0,
+      "completions/mean_length": 1702.109375,
+      "completions/mean_terminated_length": 818.1666870117188,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "epoch": 0.017142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28779250383377075,
+      "learning_rate": 7e-07,
+      "loss": 0.0,
+      "num_tokens": 1900939.0,
+      "reward": 0.32734519243240356,
+      "reward_std": 0.3870265483856201,
+      "rewards/cosine_scaled_reward/mean": 0.007422588765621185,
+      "rewards/cosine_scaled_reward/std": 0.45787373185157776,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 2048.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 2048.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.018285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2337152510881424,
+      "learning_rate": 7.5e-07,
+      "loss": -0.0,
+      "num_tokens": 2042451.0,
+      "reward": -0.5429925918579102,
+      "reward_std": 0.3153150975704193,
+      "rewards/cosine_scaled_reward/mean": -0.2714962661266327,
+      "rewards/cosine_scaled_reward/std": 0.1678173691034317,
+      "rewards/format_reward/mean": 0.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1879.0,
+      "completions/mean_length": 1564.921875,
+      "completions/mean_terminated_length": 858.8846435546875,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "epoch": 0.019428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33599403500556946,
+      "learning_rate": 8e-07,
+      "loss": -0.0,
+      "num_tokens": 2153126.0,
+      "reward": 0.17696775496006012,
+      "reward_std": 0.6489306688308716,
+      "rewards/cosine_scaled_reward/mean": -0.11464111506938934,
+      "rewards/cosine_scaled_reward/std": 0.3551919758319855,
+      "rewards/format_reward/mean": 0.40625,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1298.0,
+      "completions/mean_length": 1795.390625,
+      "completions/mean_terminated_length": 893.21435546875,
+      "completions/min_length": 619.0,
+      "completions/min_terminated_length": 619.0,
+      "epoch": 0.02057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22697053849697113,
+      "learning_rate": 8.499999999999999e-07,
+      "loss": -0.0,
+      "num_tokens": 2278407.0,
+      "reward": -0.10711958259344101,
+      "reward_std": 0.5238703489303589,
+      "rewards/cosine_scaled_reward/mean": -0.1785597801208496,
+      "rewards/cosine_scaled_reward/std": 0.2545098662376404,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1949.0,
+      "completions/mean_length": 1921.484375,
+      "completions/mean_terminated_length": 1238.300048828125,
+      "completions/min_length": 623.0,
+      "completions/min_terminated_length": 623.0,
+      "epoch": 0.021714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23972108960151672,
+      "learning_rate": 9e-07,
+      "loss": 0.0,
+      "num_tokens": 2412638.0,
+      "reward": 0.029344379901885986,
+      "reward_std": 0.6719281077384949,
+      "rewards/cosine_scaled_reward/mean": -0.086890310049057,
+      "rewards/cosine_scaled_reward/std": 0.40220555663108826,
+      "rewards/format_reward/mean": 0.203125,
+      "rewards/format_reward/std": 0.40550529956817627,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2025.0,
+      "completions/mean_length": 1728.5625,
+      "completions/mean_terminated_length": 845.4117431640625,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "epoch": 0.022857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23309311270713806,
+      "learning_rate": 9.499999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 2534618.0,
+      "reward": 0.0131673663854599,
+      "reward_std": 0.4436222314834595,
+      "rewards/cosine_scaled_reward/mean": -0.13404130935668945,
+      "rewards/cosine_scaled_reward/std": 0.32819250226020813,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1923.0,
+      "completions/mean_length": 1777.953125,
+      "completions/mean_terminated_length": 1087.8333740234375,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "epoch": 0.024,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29990270733833313,
+      "learning_rate": 1e-06,
+      "loss": -0.0,
+      "num_tokens": 2659215.0,
+      "reward": -0.1764472872018814,
+      "reward_std": 0.5121938586235046,
+      "rewards/cosine_scaled_reward/mean": -0.2444736361503601,
+      "rewards/cosine_scaled_reward/std": 0.289971262216568,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.390625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1935.0,
+      "completions/mean_length": 1361.28125,
+      "completions/mean_terminated_length": 921.0769653320312,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 0.025142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29922786355018616,
+      "learning_rate": 9.99931462820376e-07,
+      "loss": -0.0,
+      "num_tokens": 2755353.0,
+      "reward": 0.6089149713516235,
+      "reward_std": 0.5986809730529785,
+      "rewards/cosine_scaled_reward/mean": -0.05491749942302704,
+      "rewards/cosine_scaled_reward/std": 0.39076483249664307,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2043.0,
+      "completions/mean_length": 1565.046875,
+      "completions/mean_terminated_length": 903.2222290039062,
+      "completions/min_length": 405.0,
+      "completions/min_terminated_length": 405.0,
+      "epoch": 0.026285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27512773871421814,
+      "learning_rate": 9.997258721585931e-07,
+      "loss": -0.0,
+      "num_tokens": 2866308.0,
+      "reward": 0.21871733665466309,
+      "reward_std": 0.5976030826568604,
+      "rewards/cosine_scaled_reward/mean": -0.10157884657382965,
+      "rewards/cosine_scaled_reward/std": 0.3856185972690582,
+      "rewards/format_reward/mean": 0.421875,
+      "rewards/format_reward/std": 0.49776285886764526,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1947.0,
+      "completions/mean_length": 1801.671875,
+      "completions/mean_terminated_length": 1259.75,
+      "completions/min_length": 573.0,
+      "completions/min_terminated_length": 573.0,
+      "epoch": 0.027428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22642865777015686,
+      "learning_rate": 9.993832906395582e-07,
+      "loss": -0.0,
+      "num_tokens": 2992543.0,
+      "reward": 0.04899948835372925,
+      "reward_std": 0.8525694608688354,
+      "rewards/cosine_scaled_reward/mean": -0.17081275582313538,
+      "rewards/cosine_scaled_reward/std": 0.3993513882160187,
+      "rewards/format_reward/mean": 0.390625,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1715.765625,
+      "completions/mean_terminated_length": 1035.4761962890625,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "epoch": 0.02857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25316134095191956,
+      "learning_rate": 9.989038226169207e-07,
+      "loss": -0.0,
+      "num_tokens": 3112648.0,
+      "reward": 0.10585837811231613,
+      "reward_std": 0.7828943729400635,
+      "rewards/cosine_scaled_reward/mean": -0.11894579976797104,
+      "rewards/cosine_scaled_reward/std": 0.4141720235347748,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1964.0,
+      "completions/mean_length": 1917.703125,
+      "completions/mean_terminated_length": 1452.357177734375,
+      "completions/min_length": 840.0,
+      "completions/min_terminated_length": 840.0,
+      "epoch": 0.029714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2521306574344635,
+      "learning_rate": 9.982876141412855e-07,
+      "loss": -0.0,
+      "num_tokens": 3246013.0,
+      "reward": 0.17620250582695007,
+      "reward_std": 0.6548349857330322,
+      "rewards/cosine_scaled_reward/mean": -0.08377375453710556,
+      "rewards/cosine_scaled_reward/std": 0.3527655303478241,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1990.0,
+      "completions/mean_length": 1851.015625,
+      "completions/mean_terminated_length": 1147.5,
+      "completions/min_length": 506.0,
+      "completions/min_terminated_length": 506.0,
+      "epoch": 0.030857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2730060815811157,
+      "learning_rate": 9.975348529157229e-07,
+      "loss": -0.0,
+      "num_tokens": 3374766.0,
+      "reward": -0.18854813277721405,
+      "reward_std": 0.49348777532577515,
+      "rewards/cosine_scaled_reward/mean": -0.21146157383918762,
+      "rewards/cosine_scaled_reward/std": 0.2601618766784668,
+      "rewards/format_reward/mean": 0.234375,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2023.0,
+      "completions/mean_length": 1798.328125,
+      "completions/mean_terminated_length": 1049.3125,
+      "completions/min_length": 403.0,
+      "completions/min_terminated_length": 403.0,
+      "epoch": 0.032,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2566036880016327,
+      "learning_rate": 9.96645768238595e-07,
+      "loss": 0.0,
+      "num_tokens": 3500195.0,
+      "reward": 0.06705980002880096,
+      "reward_std": 0.7090284824371338,
+      "rewards/cosine_scaled_reward/mean": -0.10709509253501892,
+      "rewards/cosine_scaled_reward/std": 0.4101051986217499,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1988.0,
+      "completions/mean_length": 1930.203125,
+      "completions/mean_terminated_length": 1210.3333740234375,
+      "completions/min_length": 582.0,
+      "completions/min_terminated_length": 582.0,
+      "epoch": 0.03314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25197461247444153,
+      "learning_rate": 9.956206309337066e-07,
+      "loss": 0.0,
+      "num_tokens": 3634200.0,
+      "reward": -0.2462695688009262,
+      "reward_std": 0.5237302780151367,
+      "rewards/cosine_scaled_reward/mean": -0.2012597918510437,
+      "rewards/cosine_scaled_reward/std": 0.23252712190151215,
+      "rewards/format_reward/mean": 0.15625,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1900.0,
+      "completions/mean_length": 1847.65625,
+      "completions/mean_terminated_length": 1061.6923828125,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "epoch": 0.03428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30431485176086426,
+      "learning_rate": 9.944597532678119e-07,
+      "loss": 0.0,
+      "num_tokens": 3762986.0,
+      "reward": -0.05392302945256233,
+      "reward_std": 0.7249555587768555,
+      "rewards/cosine_scaled_reward/mean": -0.15196150541305542,
+      "rewards/cosine_scaled_reward/std": 0.34566983580589294,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1860.0,
+      "completions/mean_length": 1838.671875,
+      "completions/mean_terminated_length": 931.5833740234375,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "epoch": 0.03542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2484513372182846,
+      "learning_rate": 9.931634888554935e-07,
+      "loss": 0.0,
+      "num_tokens": 3891157.0,
+      "reward": -0.11271396279335022,
+      "reward_std": 0.6705260872840881,
+      "rewards/cosine_scaled_reward/mean": -0.1813569962978363,
+      "rewards/cosine_scaled_reward/std": 0.4071698486804962,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1715.0,
+      "completions/mean_length": 1910.109375,
+      "completions/mean_terminated_length": 1417.6429443359375,
+      "completions/min_length": 906.0,
+      "completions/min_terminated_length": 906.0,
+      "epoch": 0.036571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25329527258872986,
+      "learning_rate": 9.917322325514487e-07,
+      "loss": -0.0,
+      "num_tokens": 4023756.0,
+      "reward": -0.08931556344032288,
+      "reward_std": 0.6381070613861084,
+      "rewards/cosine_scaled_reward/mean": -0.16965776681900024,
+      "rewards/cosine_scaled_reward/std": 0.37385129928588867,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.953125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1865.0,
+      "completions/mean_length": 2023.71875,
+      "completions/mean_terminated_length": 1530.0,
+      "completions/min_length": 1107.0,
+      "completions/min_terminated_length": 1107.0,
+      "epoch": 0.037714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22758109867572784,
+      "learning_rate": 9.901664203302124e-07,
+      "loss": 0.0,
+      "num_tokens": 4164490.0,
+      "reward": -0.4589868187904358,
+      "reward_std": 0.5177067518234253,
+      "rewards/cosine_scaled_reward/mean": -0.2919934093952179,
+      "rewards/cosine_scaled_reward/std": 0.2252870500087738,
+      "rewards/format_reward/mean": 0.125,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.453125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1999.0,
+      "completions/mean_length": 1454.78125,
+      "completions/mean_terminated_length": 963.2571411132812,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 0.038857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3234354257583618,
+      "learning_rate": 9.88466529153356e-07,
+      "loss": 0.0,
+      "num_tokens": 4267148.0,
+      "reward": 0.656031608581543,
+      "reward_std": 0.7529654502868652,
+      "rewards/cosine_scaled_reward/mean": 0.05457830801606178,
+      "rewards/cosine_scaled_reward/std": 0.49684229493141174,
+      "rewards/format_reward/mean": 0.546875,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1724.0,
+      "completions/mean_length": 1819.078125,
+      "completions/mean_terminated_length": 716.0909423828125,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "epoch": 0.04,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2821458876132965,
+      "learning_rate": 9.866330768241983e-07,
+      "loss": -0.0,
+      "num_tokens": 4395065.0,
+      "reward": -0.09630556404590607,
+      "reward_std": 0.7089139223098755,
+      "rewards/cosine_scaled_reward/mean": -0.15752778947353363,
+      "rewards/cosine_scaled_reward/std": 0.3647947609424591,
+      "rewards/format_reward/mean": 0.21875,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1811.0,
+      "completions/mean_length": 1954.34375,
+      "completions/mean_terminated_length": 1382.0,
+      "completions/min_length": 949.0,
+      "completions/min_terminated_length": 949.0,
+      "epoch": 0.04114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24163897335529327,
+      "learning_rate": 9.846666218300807e-07,
+      "loss": -0.0,
+      "num_tokens": 4531255.0,
+      "reward": -0.34593287110328674,
+      "reward_std": 0.44493502378463745,
+      "rewards/cosine_scaled_reward/mean": -0.24327893555164337,
+      "rewards/cosine_scaled_reward/std": 0.24784433841705322,
+      "rewards/format_reward/mean": 0.140625,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1723.0,
+      "completions/mean_length": 1868.921875,
+      "completions/mean_terminated_length": 1092.916748046875,
+      "completions/min_length": 620.0,
+      "completions/min_terminated_length": 620.0,
+      "epoch": 0.04228571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24795544147491455,
+      "learning_rate": 9.825677631722435e-07,
+      "loss": -0.0,
+      "num_tokens": 4661890.0,
+      "reward": -0.23053905367851257,
+      "reward_std": 0.34036368131637573,
+      "rewards/cosine_scaled_reward/mean": -0.2246445268392563,
+      "rewards/cosine_scaled_reward/std": 0.15942412614822388,
+      "rewards/format_reward/mean": 0.21875,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1397.0,
+      "completions/mean_length": 1889.53125,
+      "completions/mean_terminated_length": 1033.800048828125,
+      "completions/min_length": 810.0,
+      "completions/min_terminated_length": 810.0,
+      "epoch": 0.04342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24283826351165771,
+      "learning_rate": 9.80337140183366e-07,
+      "loss": 0.0,
+      "num_tokens": 4794532.0,
+      "reward": -0.10043507814407349,
+      "reward_std": 0.47925832867622375,
+      "rewards/cosine_scaled_reward/mean": -0.13615503907203674,
+      "rewards/cosine_scaled_reward/std": 0.3336707651615143,
+      "rewards/format_reward/mean": 0.171875,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1515.0,
+      "completions/mean_length": 1644.828125,
+      "completions/mean_terminated_length": 689.9473876953125,
+      "completions/min_length": 279.0,
+      "completions/min_terminated_length": 279.0,
+      "epoch": 0.044571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28362998366355896,
+      "learning_rate": 9.779754323328192e-07,
+      "loss": 0.0,
+      "num_tokens": 4910585.0,
+      "reward": 0.12284853309392929,
+      "reward_std": 0.4183085858821869,
+      "rewards/cosine_scaled_reward/mean": -0.11045074462890625,
+      "rewards/cosine_scaled_reward/std": 0.30217844247817993,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1632.0,
+      "completions/mean_length": 1618.28125,
+      "completions/mean_terminated_length": 902.0833740234375,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "epoch": 0.045714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.262617826461792,
+      "learning_rate": 9.754833590196926e-07,
+      "loss": 0.0,
+      "num_tokens": 5024227.0,
+      "reward": 0.2076582908630371,
+      "reward_std": 0.42125773429870605,
+      "rewards/cosine_scaled_reward/mean": -0.12273336946964264,
+      "rewards/cosine_scaled_reward/std": 0.4404613971710205,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1914.0,
+      "completions/mean_length": 1717.734375,
+      "completions/mean_terminated_length": 1235.0384521484375,
+      "completions/min_length": 664.0,
+      "completions/min_terminated_length": 664.0,
+      "epoch": 0.046857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23294499516487122,
+      "learning_rate": 9.728616793536587e-07,
+      "loss": -0.0,
+      "num_tokens": 5145314.0,
+      "reward": 0.011502981185913086,
+      "reward_std": 0.6816084980964661,
+      "rewards/cosine_scaled_reward/mean": -0.22081100940704346,
+      "rewards/cosine_scaled_reward/std": 0.37589573860168457,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1672.0,
+      "completions/mean_length": 1703.921875,
+      "completions/mean_terminated_length": 579.933349609375,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 0.048,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34672290086746216,
+      "learning_rate": 9.701111919237408e-07,
+      "loss": -0.0,
+      "num_tokens": 5264725.0,
+      "reward": -0.2616002857685089,
+      "reward_std": 0.37952175736427307,
+      "rewards/cosine_scaled_reward/mean": -0.26361262798309326,
+      "rewards/cosine_scaled_reward/std": 0.17531204223632812,
+      "rewards/format_reward/mean": 0.265625,
+      "rewards/format_reward/std": 0.44515693187713623,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1370.0,
+      "completions/mean_length": 1681.84375,
+      "completions/mean_terminated_length": 814.631591796875,
+      "completions/min_length": 308.0,
+      "completions/min_terminated_length": 308.0,
+      "epoch": 0.04914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.263967901468277,
+      "learning_rate": 9.672327345550543e-07,
+      "loss": -0.0,
+      "num_tokens": 5383979.0,
+      "reward": 0.13376155495643616,
+      "reward_std": 0.46012288331985474,
+      "rewards/cosine_scaled_reward/mean": -0.08155670762062073,
+      "rewards/cosine_scaled_reward/std": 0.3612325191497803,
+      "rewards/format_reward/mean": 0.296875,
+      "rewards/format_reward/std": 0.4604927599430084,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1830.0,
+      "completions/mean_length": 1624.625,
+      "completions/mean_terminated_length": 869.9130859375,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "epoch": 0.05028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28927963972091675,
+      "learning_rate": 9.64227184053598e-07,
+      "loss": -0.0,
+      "num_tokens": 5498651.0,
+      "reward": 0.20869271457195282,
+      "reward_std": 0.5558150410652161,
+      "rewards/cosine_scaled_reward/mean": -0.0987786278128624,
+      "rewards/cosine_scaled_reward/std": 0.42912590503692627,
+      "rewards/format_reward/mean": 0.40625,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.921875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1851.0,
+      "completions/mean_length": 2006.96875,
+      "completions/mean_terminated_length": 1522.800048828125,
+      "completions/min_length": 955.0,
+      "completions/min_terminated_length": 955.0,
+      "epoch": 0.05142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24254000186920166,
+      "learning_rate": 9.610954559391704e-07,
+      "loss": 0.0,
+      "num_tokens": 5638753.0,
+      "reward": -0.2540697157382965,
+      "reward_std": 0.4600578844547272,
+      "rewards/cosine_scaled_reward/mean": -0.20515984296798706,
+      "rewards/cosine_scaled_reward/std": 0.3251590430736542,
+      "rewards/format_reward/mean": 0.15625,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1563.0,
+      "completions/mean_length": 1765.984375,
+      "completions/mean_terminated_length": 919.9375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "epoch": 0.052571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2645930349826813,
+      "learning_rate": 9.578385041664925e-07,
+      "loss": 0.0,
+      "num_tokens": 5762944.0,
+      "reward": -0.213707834482193,
+      "reward_std": 0.38778313994407654,
+      "rewards/cosine_scaled_reward/mean": -0.2318539321422577,
+      "rewards/cosine_scaled_reward/std": 0.21436986327171326,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1993.0,
+      "completions/mean_length": 1583.40625,
+      "completions/mean_terminated_length": 986.0714721679688,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "epoch": 0.053714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.311797559261322,
+      "learning_rate": 9.54457320834625e-07,
+      "loss": 0.0,
+      "num_tokens": 5874682.0,
+      "reward": 0.27925533056259155,
+      "reward_std": 0.6467443704605103,
+      "rewards/cosine_scaled_reward/mean": -0.07912233471870422,
+      "rewards/cosine_scaled_reward/std": 0.4737093150615692,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1527.0,
+      "completions/mean_length": 1690.0625,
+      "completions/mean_terminated_length": 1006.727294921875,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "epoch": 0.054857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26644304394721985,
+      "learning_rate": 9.509529358847654e-07,
+      "loss": -0.0,
+      "num_tokens": 5993390.0,
+      "reward": 0.13692031800746918,
+      "reward_std": 0.5655145049095154,
+      "rewards/cosine_scaled_reward/mean": -0.12685233354568481,
+      "rewards/cosine_scaled_reward/std": 0.32320985198020935,
+      "rewards/format_reward/mean": 0.390625,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2021.0,
+      "completions/mean_length": 1387.140625,
+      "completions/mean_terminated_length": 804.0294189453125,
+      "completions/min_length": 300.0,
+      "completions/min_terminated_length": 300.0,
+      "epoch": 0.056,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3078882396221161,
+      "learning_rate": 9.473264167865171e-07,
+      "loss": 0.0,
+      "num_tokens": 6092231.0,
+      "reward": 0.35559189319610596,
+      "reward_std": 0.5927403569221497,
+      "rewards/cosine_scaled_reward/mean": -0.09564155340194702,
+      "rewards/cosine_scaled_reward/std": 0.4046906530857086,
+      "rewards/format_reward/mean": 0.546875,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1830.0,
+      "completions/mean_length": 1674.890625,
+      "completions/mean_terminated_length": 962.5909423828125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "epoch": 0.05714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23925544321537018,
+      "learning_rate": 9.43578868212728e-07,
+      "loss": -0.0,
+      "num_tokens": 6210240.0,
+      "reward": 0.18573230504989624,
+      "reward_std": 0.5264967083930969,
+      "rewards/cosine_scaled_reward/mean": -0.09463384002447128,
+      "rewards/cosine_scaled_reward/std": 0.4100942015647888,
+      "rewards/format_reward/mean": 0.375,
+      "rewards/format_reward/std": 0.48795005679130554,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.421875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2024.0,
+      "completions/mean_length": 1347.40625,
+      "completions/mean_terminated_length": 836.1621704101562,
+      "completions/min_length": 394.0,
+      "completions/min_terminated_length": 394.0,
+      "epoch": 0.05828571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.325811505317688,
+      "learning_rate": 9.397114317029974e-07,
+      "loss": 0.0,
+      "num_tokens": 6306682.0,
+      "reward": 0.1735648661851883,
+      "reward_std": 0.5335988998413086,
+      "rewards/cosine_scaled_reward/mean": -0.21009255945682526,
+      "rewards/cosine_scaled_reward/std": 0.2623959481716156,
+      "rewards/format_reward/mean": 0.59375,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1390.0,
+      "completions/mean_length": 1727.765625,
+      "completions/mean_terminated_length": 767.0625,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 0.05942857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27392977476119995,
+      "learning_rate": 9.357252853159505e-07,
+      "loss": 0.0,
+      "num_tokens": 6428611.0,
+      "reward": -0.16267812252044678,
+      "reward_std": 0.5682471990585327,
+      "rewards/cosine_scaled_reward/mean": -0.2219640612602234,
+      "rewards/cosine_scaled_reward/std": 0.36739134788513184,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1907.0,
+      "completions/mean_length": 1609.171875,
+      "completions/mean_terminated_length": 924.5999755859375,
+      "completions/min_length": 513.0,
+      "completions/min_terminated_length": 513.0,
+      "epoch": 0.060571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28155064582824707,
+      "learning_rate": 9.316216432703916e-07,
+      "loss": -0.0,
+      "num_tokens": 6542430.0,
+      "reward": 0.0752667784690857,
+      "reward_std": 0.7118167281150818,
+      "rewards/cosine_scaled_reward/mean": -0.18892911076545715,
+      "rewards/cosine_scaled_reward/std": 0.3222156763076782,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1986.0,
+      "completions/mean_length": 1588.234375,
+      "completions/mean_terminated_length": 1067.166748046875,
+      "completions/min_length": 519.0,
+      "completions/min_terminated_length": 519.0,
+      "epoch": 0.061714285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2555343806743622,
+      "learning_rate": 9.274017555754407e-07,
+      "loss": 0.0,
+      "num_tokens": 6655221.0,
+      "reward": 0.6341299414634705,
+      "reward_std": 1.0656921863555908,
+      "rewards/cosine_scaled_reward/mean": 0.05143994837999344,
+      "rewards/cosine_scaled_reward/std": 0.5348308086395264,
+      "rewards/format_reward/mean": 0.53125,
+      "rewards/format_reward/std": 0.5029674172401428,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1420.0,
+      "completions/mean_length": 1549.5625,
+      "completions/mean_terminated_length": 821.0769653320312,
+      "completions/min_length": 444.0,
+      "completions/min_terminated_length": 444.0,
+      "epoch": 0.06285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30243629217147827,
+      "learning_rate": 9.230669076497687e-07,
+      "loss": -0.0,
+      "num_tokens": 6764681.0,
+      "reward": 0.13021975755691528,
+      "reward_std": 0.3984764516353607,
+      "rewards/cosine_scaled_reward/mean": -0.13801513612270355,
+      "rewards/cosine_scaled_reward/std": 0.41228073835372925,
+      "rewards/format_reward/mean": 0.40625,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2040.0,
+      "completions/mean_length": 1633.25,
+      "completions/mean_terminated_length": 1132.689697265625,
+      "completions/min_length": 543.0,
+      "completions/min_terminated_length": 543.0,
+      "epoch": 0.064,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23835402727127075,
+      "learning_rate": 9.186184199300463e-07,
+      "loss": -0.0,
+      "num_tokens": 6880169.0,
+      "reward": 0.27981996536254883,
+      "reward_std": 0.5018116235733032,
+      "rewards/cosine_scaled_reward/mean": -0.10227750986814499,
+      "rewards/cosine_scaled_reward/std": 0.481824666261673,
+      "rewards/format_reward/mean": 0.484375,
+      "rewards/format_reward/std": 0.5037065148353577,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1807.0,
+      "completions/mean_length": 1699.875,
+      "completions/mean_terminated_length": 1156.7999267578125,
+      "completions/min_length": 642.0,
+      "completions/min_terminated_length": 642.0,
+      "epoch": 0.06514285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22349494695663452,
+      "learning_rate": 9.140576474687263e-07,
+      "loss": 0.0,
+      "num_tokens": 7000529.0,
+      "reward": -0.026505012065172195,
+      "reward_std": 0.5785415172576904,
+      "rewards/cosine_scaled_reward/mean": -0.20856501162052155,
+      "rewards/cosine_scaled_reward/std": 0.2749907374382019,
+      "rewards/format_reward/mean": 0.390625,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2044.0,
+      "completions/mean_length": 1457.875,
+      "completions/mean_terminated_length": 1054.105224609375,
+      "completions/min_length": 447.0,
+      "completions/min_terminated_length": 447.0,
+      "epoch": 0.06628571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.261942595243454,
+      "learning_rate": 9.093859795212817e-07,
+      "loss": 0.0,
+      "num_tokens": 7103929.0,
+      "reward": 0.5745843648910522,
+      "reward_std": 0.8671218156814575,
+      "rewards/cosine_scaled_reward/mean": -0.03302033245563507,
+      "rewards/cosine_scaled_reward/std": 0.45529407262802124,
+      "rewards/format_reward/mean": 0.640625,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.484375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2010.0,
+      "completions/mean_length": 1590.0625,
+      "completions/mean_terminated_length": 1159.8787841796875,
+      "completions/min_length": 591.0,
+      "completions/min_terminated_length": 591.0,
+      "epoch": 0.06742857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24828943610191345,
+      "learning_rate": 9.046048391230247e-07,
+      "loss": -0.0,
+      "num_tokens": 7216157.0,
+      "reward": 0.3377103805541992,
+      "reward_std": 0.5543617010116577,
+      "rewards/cosine_scaled_reward/mean": -0.1045822948217392,
+      "rewards/cosine_scaled_reward/std": 0.39040952920913696,
+      "rewards/format_reward/mean": 0.546875,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1976.0,
+      "completions/mean_length": 1622.84375,
+      "completions/mean_terminated_length": 1076.21435546875,
+      "completions/min_length": 555.0,
+      "completions/min_terminated_length": 555.0,
+      "epoch": 0.06857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2752656936645508,
+      "learning_rate": 8.997156826556369e-07,
+      "loss": -0.0,
+      "num_tokens": 7330907.0,
+      "reward": 0.11114693433046341,
+      "reward_std": 0.6926254034042358,
+      "rewards/cosine_scaled_reward/mean": -0.1788015365600586,
+      "rewards/cosine_scaled_reward/std": 0.39409172534942627,
+      "rewards/format_reward/mean": 0.46875,
+      "rewards/format_reward/std": 0.5029674172401428,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1984.0,
+      "completions/mean_length": 1708.859375,
+      "completions/mean_terminated_length": 1014.4285888671875,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "epoch": 0.06971428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22669929265975952,
+      "learning_rate": 8.9471999940354e-07,
+      "loss": -0.0,
+      "num_tokens": 7451794.0,
+      "reward": 0.2345120906829834,
+      "reward_std": 0.6293160319328308,
+      "rewards/cosine_scaled_reward/mean": -0.1093064472079277,
+      "rewards/cosine_scaled_reward/std": 0.29189831018447876,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2008.0,
+      "completions/mean_length": 1281.53125,
+      "completions/mean_terminated_length": 1004.2978515625,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "epoch": 0.07085714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25438693165779114,
+      "learning_rate": 8.896193111002475e-07,
+      "loss": 0.0,
+      "num_tokens": 7544044.0,
+      "reward": 0.9180847406387329,
+      "reward_std": 0.6390912532806396,
+      "rewards/cosine_scaled_reward/mean": 0.06841734796762466,
+      "rewards/cosine_scaled_reward/std": 0.48315128684043884,
+      "rewards/format_reward/mean": 0.78125,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.359375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1678.0,
+      "completions/mean_length": 1310.46875,
+      "completions/mean_terminated_length": 896.731689453125,
+      "completions/min_length": 295.0,
+      "completions/min_terminated_length": 295.0,
+      "epoch": 0.072,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28795576095581055,
+      "learning_rate": 8.844151714648274e-07,
+      "loss": -0.0,
+      "num_tokens": 7638170.0,
+      "reward": 0.6488770246505737,
+      "reward_std": 0.7876260876655579,
+      "rewards/cosine_scaled_reward/mean": -0.019311510026454926,
+      "rewards/cosine_scaled_reward/std": 0.4736698865890503,
+      "rewards/format_reward/mean": 0.6875,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1969.0,
+      "completions/mean_length": 1307.625,
+      "completions/mean_terminated_length": 1039.8297119140625,
+      "completions/min_length": 376.0,
+      "completions/min_terminated_length": 376.0,
+      "epoch": 0.07314285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25637197494506836,
+      "learning_rate": 8.791091657286267e-07,
+      "loss": -0.0,
+      "num_tokens": 7732810.0,
+      "reward": 0.8280279636383057,
+      "reward_std": 0.6804471015930176,
+      "rewards/cosine_scaled_reward/mean": 0.015576483681797981,
+      "rewards/cosine_scaled_reward/std": 0.44819310307502747,
+      "rewards/format_reward/mean": 0.796875,
+      "rewards/format_reward/std": 0.40550529956817627,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.359375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1846.0,
+      "completions/mean_length": 1322.125,
+      "completions/mean_terminated_length": 914.9268188476562,
+      "completions/min_length": 297.0,
+      "completions/min_terminated_length": 297.0,
+      "epoch": 0.07428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2944399118423462,
+      "learning_rate": 8.737029101523929e-07,
+      "loss": -0.0,
+      "num_tokens": 7828130.0,
+      "reward": 0.15610456466674805,
+      "reward_std": 0.4606686234474182,
+      "rewards/cosine_scaled_reward/mean": -0.24226020276546478,
+      "rewards/cosine_scaled_reward/std": 0.33131492137908936,
+      "rewards/format_reward/mean": 0.640625,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1803.0,
+      "completions/mean_length": 1020.21875,
+      "completions/mean_terminated_length": 806.9057006835938,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.07542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32644009590148926,
+      "learning_rate": 8.681980515339463e-07,
+      "loss": 0.0,
+      "num_tokens": 7903656.0,
+      "reward": 0.7972471714019775,
+      "reward_std": 0.7674820423126221,
+      "rewards/cosine_scaled_reward/mean": -0.031063925474882126,
+      "rewards/cosine_scaled_reward/std": 0.5106223225593567,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1910.0,
+      "completions/mean_length": 1750.859375,
+      "completions/mean_terminated_length": 1142.4285888671875,
+      "completions/min_length": 585.0,
+      "completions/min_terminated_length": 585.0,
+      "epoch": 0.07657142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2270829975605011,
+      "learning_rate": 8.625962667065487e-07,
+      "loss": 0.0,
+      "num_tokens": 8026447.0,
+      "reward": -0.1400720775127411,
+      "reward_std": 0.3325888514518738,
+      "rewards/cosine_scaled_reward/mean": -0.24972353875637054,
+      "rewards/cosine_scaled_reward/std": 0.16404789686203003,
+      "rewards/format_reward/mean": 0.359375,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1424.0,
+      "completions/mean_length": 769.546875,
+      "completions/mean_terminated_length": 637.2930908203125,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 0.07771428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.37025144696235657,
+      "learning_rate": 8.568992620281243e-07,
+      "loss": -0.0,
+      "num_tokens": 8084954.0,
+      "reward": 0.9792699813842773,
+      "reward_std": 0.804767370223999,
+      "rewards/cosine_scaled_reward/mean": 0.03651002421975136,
+      "rewards/cosine_scaled_reward/std": 0.46041443943977356,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1701.0,
+      "completions/mean_length": 1086.234375,
+      "completions/mean_terminated_length": 886.6226806640625,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 0.07885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3763800263404846,
+      "learning_rate": 8.511087728614862e-07,
+      "loss": 0.0,
+      "num_tokens": 8164817.0,
+      "reward": 0.35803771018981934,
+      "reward_std": 0.5702667236328125,
+      "rewards/cosine_scaled_reward/mean": -0.24285613000392914,
+      "rewards/cosine_scaled_reward/std": 0.3019825220108032,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1898.0,
+      "completions/mean_length": 1463.375,
+      "completions/mean_terminated_length": 1112.5999755859375,
+      "completions/min_length": 503.0,
+      "completions/min_terminated_length": 503.0,
+      "epoch": 0.08,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24232418835163116,
+      "learning_rate": 8.452265630457282e-07,
+      "loss": -0.0,
+      "num_tokens": 8269929.0,
+      "reward": 0.3703588843345642,
+      "reward_std": 0.7288752794265747,
+      "rewards/cosine_scaled_reward/mean": -0.1351330280303955,
+      "rewards/cosine_scaled_reward/std": 0.3751916289329529,
+      "rewards/format_reward/mean": 0.640625,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1991.0,
+      "completions/mean_length": 1409.859375,
+      "completions/mean_terminated_length": 973.2368774414062,
+      "completions/min_length": 404.0,
+      "completions/min_terminated_length": 404.0,
+      "epoch": 0.08114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.300010621547699,
+      "learning_rate": 8.392544243589427e-07,
+      "loss": 0.0,
+      "num_tokens": 8370880.0,
+      "reward": 0.5196826457977295,
+      "reward_std": 0.7097917795181274,
+      "rewards/cosine_scaled_reward/mean": -0.044846177101135254,
+      "rewards/cosine_scaled_reward/std": 0.508389949798584,
+      "rewards/format_reward/mean": 0.609375,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1801.0,
+      "completions/mean_length": 1228.046875,
+      "completions/mean_terminated_length": 931.4680786132812,
+      "completions/min_length": 331.0,
+      "completions/min_terminated_length": 331.0,
+      "epoch": 0.08228571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30454304814338684,
+      "learning_rate": 8.331941759724268e-07,
+      "loss": -0.0,
+      "num_tokens": 8459827.0,
+      "reward": 0.41365131735801697,
+      "reward_std": 0.5005639791488647,
+      "rewards/cosine_scaled_reward/mean": -0.1759868562221527,
+      "rewards/cosine_scaled_reward/std": 0.19868774712085724,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1719.0,
+      "completions/mean_length": 1513.28125,
+      "completions/mean_terminated_length": 1192.4500732421875,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "epoch": 0.08342857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27848970890045166,
+      "learning_rate": 8.270476638965461e-07,
+      "loss": -0.0,
+      "num_tokens": 8567405.0,
+      "reward": 0.09570223093032837,
+      "reward_std": 0.5445049405097961,
+      "rewards/cosine_scaled_reward/mean": -0.2802739143371582,
+      "rewards/cosine_scaled_reward/std": 0.25603488087654114,
+      "rewards/format_reward/mean": 0.65625,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1888.0,
+      "completions/mean_length": 1240.125,
+      "completions/mean_terminated_length": 924.0,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "epoch": 0.08457142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2789021134376526,
+      "learning_rate": 8.208167604184217e-07,
+      "loss": 0.0,
+      "num_tokens": 8656701.0,
+      "reward": 0.7823752760887146,
+      "reward_std": 0.6479132175445557,
+      "rewards/cosine_scaled_reward/mean": 0.031812600791454315,
+      "rewards/cosine_scaled_reward/std": 0.5397623181343079,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2036.0,
+      "completions/mean_length": 1455.953125,
+      "completions/mean_terminated_length": 1186.8409423828125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "epoch": 0.08571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22443196177482605,
+      "learning_rate": 8.145033635316128e-07,
+      "loss": 0.0,
+      "num_tokens": 8760842.0,
+      "reward": 0.8040015697479248,
+      "reward_std": 0.5675323009490967,
+      "rewards/cosine_scaled_reward/mean": 0.027000809088349342,
+      "rewards/cosine_scaled_reward/std": 0.5096040964126587,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1720.0,
+      "completions/mean_length": 1177.859375,
+      "completions/mean_terminated_length": 863.1276245117188,
+      "completions/min_length": 372.0,
+      "completions/min_terminated_length": 372.0,
+      "epoch": 0.08685714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32647648453712463,
+      "learning_rate": 8.081093963579707e-07,
+      "loss": 0.0,
+      "num_tokens": 8846625.0,
+      "reward": 0.310506671667099,
+      "reward_std": 0.5110941529273987,
+      "rewards/cosine_scaled_reward/mean": -0.2119341641664505,
+      "rewards/cosine_scaled_reward/std": 0.24737994372844696,
+      "rewards/format_reward/mean": 0.734375,
+      "rewards/format_reward/std": 0.44515693187713623,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1787.0,
+      "completions/mean_length": 1263.4375,
+      "completions/mean_terminated_length": 1043.760009765625,
+      "completions/min_length": 501.0,
+      "completions/min_terminated_length": 501.0,
+      "epoch": 0.088,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2545543611049652,
+      "learning_rate": 8.01636806561836e-07,
+      "loss": -0.0,
+      "num_tokens": 8939061.0,
+      "reward": 0.5484907031059265,
+      "reward_std": 0.48998576402664185,
+      "rewards/cosine_scaled_reward/mean": -0.13200464844703674,
+      "rewards/cosine_scaled_reward/std": 0.3430649936199188,
+      "rewards/format_reward/mean": 0.8125,
+      "rewards/format_reward/std": 0.39339789748191833,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1991.0,
+      "completions/mean_length": 1460.78125,
+      "completions/mean_terminated_length": 1059.0,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "epoch": 0.08914285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2583931088447571,
+      "learning_rate": 7.950875657567621e-07,
+      "loss": 0.0,
+      "num_tokens": 9043271.0,
+      "reward": 0.6075442433357239,
+      "reward_std": 0.6895643472671509,
+      "rewards/cosine_scaled_reward/mean": -0.0009153857827186584,
+      "rewards/cosine_scaled_reward/std": 0.48922818899154663,
+      "rewards/format_reward/mean": 0.609375,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1956.0,
+      "completions/mean_length": 1054.875,
+      "completions/mean_terminated_length": 892.3635864257812,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 0.09028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29089078307151794,
+      "learning_rate": 7.884636689049422e-07,
+      "loss": 0.0,
+      "num_tokens": 9120879.0,
+      "reward": 0.6885831356048584,
+      "reward_std": 0.508629322052002,
+      "rewards/cosine_scaled_reward/mean": -0.09320840239524841,
+      "rewards/cosine_scaled_reward/std": 0.38835227489471436,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2037.0,
+      "completions/mean_length": 1399.046875,
+      "completions/mean_terminated_length": 1145.1087646484375,
+      "completions/min_length": 484.0,
+      "completions/min_terminated_length": 484.0,
+      "epoch": 0.09142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27458345890045166,
+      "learning_rate": 7.817671337095244e-07,
+      "loss": 0.0,
+      "num_tokens": 9220810.0,
+      "reward": 0.5549384355545044,
+      "reward_std": 0.7092134952545166,
+      "rewards/cosine_scaled_reward/mean": -0.09753081202507019,
+      "rewards/cosine_scaled_reward/std": 0.4125780463218689,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1833.0,
+      "completions/mean_length": 1084.984375,
+      "completions/mean_terminated_length": 906.6481323242188,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "epoch": 0.09257142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.37247684597969055,
+      "learning_rate": 7.75e-07,
+      "loss": -0.0,
+      "num_tokens": 9301521.0,
+      "reward": 0.5357480049133301,
+      "reward_std": 0.5661624670028687,
+      "rewards/cosine_scaled_reward/mean": -0.18525099754333496,
+      "rewards/cosine_scaled_reward/std": 0.3385297954082489,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2046.0,
+      "completions/mean_length": 1260.921875,
+      "completions/mean_terminated_length": 998.5625,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "epoch": 0.09371428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27329322695732117,
+      "learning_rate": 7.681643291108517e-07,
+      "loss": -0.0,
+      "num_tokens": 9392548.0,
+      "reward": 0.9478914737701416,
+      "reward_std": 0.4313860237598419,
+      "rewards/cosine_scaled_reward/mean": 0.09894578158855438,
+      "rewards/cosine_scaled_reward/std": 0.5477120876312256,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2039.0,
+      "completions/mean_length": 1309.671875,
+      "completions/mean_terminated_length": 922.9285888671875,
+      "completions/min_length": 303.0,
+      "completions/min_terminated_length": 303.0,
+      "epoch": 0.09485714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3202998638153076,
+      "learning_rate": 7.612622032536507e-07,
+      "loss": -0.0,
+      "num_tokens": 9487455.0,
+      "reward": 0.5201998949050903,
+      "reward_std": 0.6858996152877808,
+      "rewards/cosine_scaled_reward/mean": -0.09927503764629364,
+      "rewards/cosine_scaled_reward/std": 0.37909674644470215,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1685.0,
+      "completions/mean_length": 1185.703125,
+      "completions/mean_terminated_length": 965.9019775390625,
+      "completions/min_length": 390.0,
+      "completions/min_terminated_length": 390.0,
+      "epoch": 0.096,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29646041989326477,
+      "learning_rate": 7.54295724882796e-07,
+      "loss": -0.0,
+      "num_tokens": 9574036.0,
+      "reward": 0.6779025793075562,
+      "reward_std": 0.557724118232727,
+      "rewards/cosine_scaled_reward/mean": -0.09073619544506073,
+      "rewards/cosine_scaled_reward/std": 0.3855368196964264,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2021.0,
+      "completions/mean_length": 1297.828125,
+      "completions/mean_terminated_length": 1158.907470703125,
+      "completions/min_length": 601.0,
+      "completions/min_terminated_length": 601.0,
+      "epoch": 0.09714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21307455003261566,
+      "learning_rate": 7.472670160550848e-07,
+      "loss": 0.0,
+      "num_tokens": 9667417.0,
+      "reward": 0.5093189477920532,
+      "reward_std": 0.6006681323051453,
+      "rewards/cosine_scaled_reward/mean": -0.1672155261039734,
+      "rewards/cosine_scaled_reward/std": 0.34896284341812134,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1859.0,
+      "completions/mean_length": 1348.90625,
+      "completions/mean_terminated_length": 1096.04248046875,
+      "completions/min_length": 501.0,
+      "completions/min_terminated_length": 501.0,
+      "epoch": 0.09828571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2883393168449402,
+      "learning_rate": 7.401782177833147e-07,
+      "loss": -0.0,
+      "num_tokens": 9764603.0,
+      "reward": 0.8025823831558228,
+      "reward_std": 0.547119677066803,
+      "rewards/cosine_scaled_reward/mean": 0.01847870647907257,
+      "rewards/cosine_scaled_reward/std": 0.4346420168876648,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1782.0,
+      "completions/mean_length": 1086.96875,
+      "completions/mean_terminated_length": 909.0,
+      "completions/min_length": 350.0,
+      "completions/min_terminated_length": 350.0,
+      "epoch": 0.09942857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31888866424560547,
+      "learning_rate": 7.330314893841101e-07,
+      "loss": -0.0,
+      "num_tokens": 9844289.0,
+      "reward": 0.5533354878425598,
+      "reward_std": 0.5319498777389526,
+      "rewards/cosine_scaled_reward/mean": -0.1530197560787201,
+      "rewards/cosine_scaled_reward/std": 0.2434682846069336,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1984.0,
+      "completions/mean_length": 954.921875,
+      "completions/mean_terminated_length": 919.6612548828125,
+      "completions/min_length": 114.0,
+      "completions/min_terminated_length": 114.0,
+      "epoch": 0.10057142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3025936484336853,
+      "learning_rate": 7.258290078201731e-07,
+      "loss": -0.0,
+      "num_tokens": 9915916.0,
+      "reward": 1.2692296504974365,
+      "reward_std": 0.5115163326263428,
+      "rewards/cosine_scaled_reward/mean": 0.13461479544639587,
+      "rewards/cosine_scaled_reward/std": 0.506001353263855,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1926.0,
+      "completions/mean_length": 1351.8125,
+      "completions/mean_terminated_length": 1174.35302734375,
+      "completions/min_length": 650.0,
+      "completions/min_terminated_length": 650.0,
+      "epoch": 0.10171428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23423585295677185,
+      "learning_rate": 7.185729670371604e-07,
+      "loss": -0.0,
+      "num_tokens": 10013432.0,
+      "reward": 0.724889874458313,
+      "reward_std": 0.7425336837768555,
+      "rewards/cosine_scaled_reward/mean": -0.0828675627708435,
+      "rewards/cosine_scaled_reward/std": 0.3893774449825287,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1906.0,
+      "completions/mean_length": 1153.28125,
+      "completions/mean_terminated_length": 1025.46435546875,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "epoch": 0.10285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3860023021697998,
+      "learning_rate": 7.11265577295385e-07,
+      "loss": -0.0,
+      "num_tokens": 10097242.0,
+      "reward": 0.5000253915786743,
+      "reward_std": 0.5103108286857605,
+      "rewards/cosine_scaled_reward/mean": -0.18748730421066284,
+      "rewards/cosine_scaled_reward/std": 0.2787182629108429,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.328125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2032.0,
+      "completions/mean_length": 1455.484375,
+      "completions/mean_terminated_length": 1166.1163330078125,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "epoch": 0.104,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2551063895225525,
+      "learning_rate": 7.039090644965509e-07,
+      "loss": 0.0,
+      "num_tokens": 10200961.0,
+      "reward": 0.4053259789943695,
+      "reward_std": 0.663999617099762,
+      "rewards/cosine_scaled_reward/mean": -0.18796202540397644,
+      "rewards/cosine_scaled_reward/std": 0.35777655243873596,
+      "rewards/format_reward/mean": 0.78125,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2004.0,
+      "completions/mean_length": 1176.953125,
+      "completions/mean_terminated_length": 1015.6481323242188,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "epoch": 0.10514285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27449366450309753,
+      "learning_rate": 6.965056695057204e-07,
+      "loss": -0.0,
+      "num_tokens": 10286278.0,
+      "reward": 0.5743436217308044,
+      "reward_std": 0.6229422092437744,
+      "rewards/cosine_scaled_reward/mean": -0.15032817423343658,
+      "rewards/cosine_scaled_reward/std": 0.2899566888809204,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2027.0,
+      "completions/mean_length": 1434.875,
+      "completions/mean_terminated_length": 1156.181884765625,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "epoch": 0.10628571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2839376926422119,
+      "learning_rate": 6.890576474687263e-07,
+      "loss": 0.0,
+      "num_tokens": 10389454.0,
+      "reward": 0.30658647418022156,
+      "reward_std": 0.5343226194381714,
+      "rewards/cosine_scaled_reward/mean": -0.22951926290988922,
+      "rewards/cosine_scaled_reward/std": 0.2324177473783493,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1684.0,
+      "completions/mean_length": 1242.390625,
+      "completions/mean_terminated_length": 927.1522216796875,
+      "completions/min_length": 508.0,
+      "completions/min_terminated_length": 508.0,
+      "epoch": 0.10742857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2985072433948517,
+      "learning_rate": 6.815672671252315e-07,
+      "loss": 0.0,
+      "num_tokens": 10478735.0,
+      "reward": 0.6593698263168335,
+      "reward_std": 0.5845412015914917,
+      "rewards/cosine_scaled_reward/mean": -0.02969011664390564,
+      "rewards/cosine_scaled_reward/std": 0.47056320309638977,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1931.0,
+      "completions/mean_length": 1203.265625,
+      "completions/mean_terminated_length": 1082.58935546875,
+      "completions/min_length": 573.0,
+      "completions/min_terminated_length": 573.0,
+      "epoch": 0.10857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2689598798751831,
+      "learning_rate": 6.740368101176495e-07,
+      "loss": 0.0,
+      "num_tokens": 10566272.0,
+      "reward": 0.4301251173019409,
+      "reward_std": 0.4795047640800476,
+      "rewards/cosine_scaled_reward/mean": -0.22243742644786835,
+      "rewards/cosine_scaled_reward/std": 0.2575407326221466,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1827.0,
+      "completions/mean_length": 1205.5625,
+      "completions/mean_terminated_length": 990.8235473632812,
+      "completions/min_length": 441.0,
+      "completions/min_terminated_length": 441.0,
+      "epoch": 0.10971428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30502915382385254,
+      "learning_rate": 6.664685702961344e-07,
+      "loss": -0.0,
+      "num_tokens": 10654564.0,
+      "reward": 0.896080493927002,
+      "reward_std": 0.6987663507461548,
+      "rewards/cosine_scaled_reward/mean": 0.02616523765027523,
+      "rewards/cosine_scaled_reward/std": 0.460237056016922,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1777.0,
+      "completions/mean_length": 1170.390625,
+      "completions/mean_terminated_length": 988.2453002929688,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "epoch": 0.11085714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3103901743888855,
+      "learning_rate": 6.588648530198504e-07,
+      "loss": -0.0,
+      "num_tokens": 10739733.0,
+      "reward": 0.6633297204971313,
+      "reward_std": 0.609075665473938,
+      "rewards/cosine_scaled_reward/mean": -0.12927262485027313,
+      "rewards/cosine_scaled_reward/std": 0.4114542305469513,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1817.0,
+      "completions/mean_length": 1136.5625,
+      "completions/mean_terminated_length": 947.396240234375,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "epoch": 0.112,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2510873079299927,
+      "learning_rate": 6.512279744547392e-07,
+      "loss": 0.0,
+      "num_tokens": 10823537.0,
+      "reward": 0.6613268256187439,
+      "reward_std": 0.4785424768924713,
+      "rewards/cosine_scaled_reward/mean": -0.09902409464120865,
+      "rewards/cosine_scaled_reward/std": 0.4345317482948303,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2031.0,
+      "completions/mean_length": 1171.8125,
+      "completions/mean_terminated_length": 1081.17236328125,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 0.11314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.281054824590683,
+      "learning_rate": 6.435602608679916e-07,
+      "loss": -0.0,
+      "num_tokens": 10909701.0,
+      "reward": 1.0416245460510254,
+      "reward_std": 0.6949809789657593,
+      "rewards/cosine_scaled_reward/mean": 0.0520622618496418,
+      "rewards/cosine_scaled_reward/std": 0.508481502532959,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1935.0,
+      "completions/mean_length": 1120.8125,
+      "completions/mean_terminated_length": 1024.8966064453125,
+      "completions/min_length": 410.0,
+      "completions/min_terminated_length": 410.0,
+      "epoch": 0.11428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2910788655281067,
+      "learning_rate": 6.358640479194451e-07,
+      "loss": 0.0,
+      "num_tokens": 10991145.0,
+      "reward": 1.2036188840866089,
+      "reward_std": 0.8533884286880493,
+      "rewards/cosine_scaled_reward/mean": 0.14087192714214325,
+      "rewards/cosine_scaled_reward/std": 0.5375887751579285,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1978.0,
+      "completions/mean_length": 1076.953125,
+      "completions/mean_terminated_length": 1029.1966552734375,
+      "completions/min_length": 423.0,
+      "completions/min_terminated_length": 423.0,
+      "epoch": 0.11542857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33955609798431396,
+      "learning_rate": 6.281416799501187e-07,
+      "loss": 0.0,
+      "num_tokens": 11071502.0,
+      "reward": 0.7810705900192261,
+      "reward_std": 0.5973731279373169,
+      "rewards/cosine_scaled_reward/mean": -0.10165221989154816,
+      "rewards/cosine_scaled_reward/std": 0.4130260646343231,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1959.0,
+      "completions/mean_length": 1092.078125,
+      "completions/mean_terminated_length": 935.654541015625,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "epoch": 0.11657142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34537607431411743,
+      "learning_rate": 6.203955092681039e-07,
+      "loss": 0.0,
+      "num_tokens": 11151547.0,
+      "reward": 0.6441041231155396,
+      "reward_std": 0.53089839220047,
+      "rewards/cosine_scaled_reward/mean": -0.10763543844223022,
+      "rewards/cosine_scaled_reward/std": 0.39948928356170654,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2039.0,
+      "completions/mean_length": 1120.625,
+      "completions/mean_terminated_length": 1006.7368774414062,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 0.11771428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.343980997800827,
+      "learning_rate": 6.126278954320294e-07,
+      "loss": 0.0,
+      "num_tokens": 11233619.0,
+      "reward": 0.6925251483917236,
+      "reward_std": 0.5938367247581482,
+      "rewards/cosine_scaled_reward/mean": -0.13029994070529938,
+      "rewards/cosine_scaled_reward/std": 0.37749138474464417,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1913.0,
+      "completions/mean_length": 1120.359375,
+      "completions/mean_terminated_length": 948.5740966796875,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "epoch": 0.11885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30854102969169617,
+      "learning_rate": 6.048412045323164e-07,
+      "loss": -0.0,
+      "num_tokens": 11315786.0,
+      "reward": 0.560060977935791,
+      "reward_std": 0.5216183662414551,
+      "rewards/cosine_scaled_reward/mean": -0.1418444812297821,
+      "rewards/cosine_scaled_reward/std": 0.33836889266967773,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1897.0,
+      "completions/mean_length": 1158.421875,
+      "completions/mean_terminated_length": 953.1346435546875,
+      "completions/min_length": 503.0,
+      "completions/min_terminated_length": 503.0,
+      "epoch": 0.12,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29531243443489075,
+      "learning_rate": 5.97037808470444e-07,
+      "loss": -0.0,
+      "num_tokens": 11401213.0,
+      "reward": 1.0410652160644531,
+      "reward_std": 0.7858219742774963,
+      "rewards/cosine_scaled_reward/mean": 0.09084508568048477,
+      "rewards/cosine_scaled_reward/std": 0.5061684250831604,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1867.0,
+      "completions/mean_length": 1045.859375,
+      "completions/mean_terminated_length": 837.867919921875,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "epoch": 0.12114285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26259294152259827,
+      "learning_rate": 5.892200842364462e-07,
+      "loss": -0.0,
+      "num_tokens": 11478980.0,
+      "reward": 1.0545225143432617,
+      "reward_std": 0.7633667588233948,
+      "rewards/cosine_scaled_reward/mean": 0.07413630187511444,
+      "rewards/cosine_scaled_reward/std": 0.48842984437942505,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1999.0,
+      "completions/mean_length": 1101.234375,
+      "completions/mean_terminated_length": 946.30908203125,
+      "completions/min_length": 346.0,
+      "completions/min_terminated_length": 346.0,
+      "epoch": 0.12228571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3363504409790039,
+      "learning_rate": 5.813904131848564e-07,
+      "loss": 0.0,
+      "num_tokens": 11560611.0,
+      "reward": 0.648673415184021,
+      "reward_std": 0.6051540970802307,
+      "rewards/cosine_scaled_reward/mean": -0.11316327750682831,
+      "rewards/cosine_scaled_reward/std": 0.37149766087532043,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1857.0,
+      "completions/mean_length": 1225.28125,
+      "completions/mean_terminated_length": 1054.5283203125,
+      "completions/min_length": 515.0,
+      "completions/min_terminated_length": 515.0,
+      "epoch": 0.12342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2867675721645355,
+      "learning_rate": 5.735511803093248e-07,
+      "loss": 0.0,
+      "num_tokens": 11649389.0,
+      "reward": 0.560509204864502,
+      "reward_std": 0.6691359877586365,
+      "rewards/cosine_scaled_reward/mean": -0.14943289756774902,
+      "rewards/cosine_scaled_reward/std": 0.4461749494075775,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2001.0,
+      "completions/mean_length": 1227.203125,
+      "completions/mean_terminated_length": 1056.84912109375,
+      "completions/min_length": 513.0,
+      "completions/min_terminated_length": 513.0,
+      "epoch": 0.12457142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2772690951824188,
+      "learning_rate": 5.657047735161255e-07,
+      "loss": -0.0,
+      "num_tokens": 11739178.0,
+      "reward": 0.6980891227722168,
+      "reward_std": 0.624833345413208,
+      "rewards/cosine_scaled_reward/mean": -0.0650179386138916,
+      "rewards/cosine_scaled_reward/std": 0.41062912344932556,
+      "rewards/format_reward/mean": 0.828125,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1727.0,
+      "completions/mean_length": 1145.0,
+      "completions/mean_terminated_length": 914.8235473632812,
+      "completions/min_length": 414.0,
+      "completions/min_terminated_length": 414.0,
+      "epoch": 0.12571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3468596637248993,
+      "learning_rate": 5.578535828967777e-07,
+      "loss": -0.0,
+      "num_tokens": 11823234.0,
+      "reward": 0.6972323656082153,
+      "reward_std": 0.5477026104927063,
+      "rewards/cosine_scaled_reward/mean": -0.08888379484415054,
+      "rewards/cosine_scaled_reward/std": 0.3565239906311035,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1969.0,
+      "completions/max_terminated_length": 1969.0,
+      "completions/mean_length": 977.046875,
+      "completions/mean_terminated_length": 977.046875,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "epoch": 0.12685714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3180137574672699,
+      "learning_rate": 5.5e-07,
+      "loss": 0.0,
+      "num_tokens": 11895885.0,
+      "reward": 0.8744360208511353,
+      "reward_std": 0.5815237164497375,
+      "rewards/cosine_scaled_reward/mean": -0.06278196722269058,
+      "rewards/cosine_scaled_reward/std": 0.37791064381599426,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1954.0,
+      "completions/mean_length": 1269.421875,
+      "completions/mean_terminated_length": 1089.75,
+      "completions/min_length": 605.0,
+      "completions/min_terminated_length": 605.0,
+      "epoch": 0.128,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2817465364933014,
+      "learning_rate": 5.421464171032224e-07,
+      "loss": -0.0,
+      "num_tokens": 11988224.0,
+      "reward": 0.9151681065559387,
+      "reward_std": 0.594943642616272,
+      "rewards/cosine_scaled_reward/mean": 0.02789657562971115,
+      "rewards/cosine_scaled_reward/std": 0.4965399205684662,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1910.0,
+      "completions/max_terminated_length": 1910.0,
+      "completions/mean_length": 934.578125,
+      "completions/mean_terminated_length": 934.578125,
+      "completions/min_length": 326.0,
+      "completions/min_terminated_length": 326.0,
+      "epoch": 0.12914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3341560959815979,
+      "learning_rate": 5.342952264838747e-07,
+      "loss": -0.0,
+      "num_tokens": 12058333.0,
+      "reward": 1.0256879329681396,
+      "reward_std": 0.717230498790741,
+      "rewards/cosine_scaled_reward/mean": 0.02065650373697281,
+      "rewards/cosine_scaled_reward/std": 0.4963410794734955,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1911.0,
+      "completions/mean_length": 1055.21875,
+      "completions/mean_terminated_length": 971.0847778320312,
+      "completions/min_length": 124.0,
+      "completions/min_terminated_length": 124.0,
+      "epoch": 0.13028571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3800676763057709,
+      "learning_rate": 5.264488196906752e-07,
+      "loss": -0.0,
+      "num_tokens": 12135715.0,
+      "reward": 0.649993896484375,
+      "reward_std": 0.5865596532821655,
+      "rewards/cosine_scaled_reward/mean": -0.1750030517578125,
+      "rewards/cosine_scaled_reward/std": 0.3388007879257202,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1981.0,
+      "completions/mean_length": 1169.671875,
+      "completions/mean_terminated_length": 987.3773803710938,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "epoch": 0.13142857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3112519085407257,
+      "learning_rate": 5.186095868151436e-07,
+      "loss": 0.0,
+      "num_tokens": 12221790.0,
+      "reward": 0.7184536457061768,
+      "reward_std": 0.44992831349372864,
+      "rewards/cosine_scaled_reward/mean": -0.06264819949865341,
+      "rewards/cosine_scaled_reward/std": 0.44565486907958984,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1936.0,
+      "completions/mean_length": 1224.890625,
+      "completions/mean_terminated_length": 1072.4630126953125,
+      "completions/min_length": 393.0,
+      "completions/min_terminated_length": 393.0,
+      "epoch": 0.13257142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2884223461151123,
+      "learning_rate": 5.107799157635538e-07,
+      "loss": 0.0,
+      "num_tokens": 12311567.0,
+      "reward": 0.8372049927711487,
+      "reward_std": 0.608986496925354,
+      "rewards/cosine_scaled_reward/mean": -0.026710007339715958,
+      "rewards/cosine_scaled_reward/std": 0.4437602162361145,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1927.0,
+      "completions/mean_length": 1078.65625,
+      "completions/mean_terminated_length": 1030.9835205078125,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "epoch": 0.1337142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3016076385974884,
+      "learning_rate": 5.02962191529556e-07,
+      "loss": -0.0,
+      "num_tokens": 12391625.0,
+      "reward": 0.8182538747787476,
+      "reward_std": 0.6463132500648499,
+      "rewards/cosine_scaled_reward/mean": -0.09087307006120682,
+      "rewards/cosine_scaled_reward/std": 0.3895137310028076,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1946.0,
+      "completions/mean_length": 1226.046875,
+      "completions/mean_terminated_length": 952.0625,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "epoch": 0.13485714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2991194427013397,
+      "learning_rate": 4.951587954676837e-07,
+      "loss": 0.0,
+      "num_tokens": 12480628.0,
+      "reward": 0.6370267868041992,
+      "reward_std": 0.7525250911712646,
+      "rewards/cosine_scaled_reward/mean": -0.056486621499061584,
+      "rewards/cosine_scaled_reward/std": 0.44576171040534973,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1038.96875,
+      "completions/mean_terminated_length": 894.8214721679688,
+      "completions/min_length": 149.0,
+      "completions/min_terminated_length": 149.0,
+      "epoch": 0.136,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.4483291506767273,
+      "learning_rate": 4.873721045679706e-07,
+      "loss": 0.0,
+      "num_tokens": 12557530.0,
+      "reward": 0.9855979084968567,
+      "reward_std": 0.6055079698562622,
+      "rewards/cosine_scaled_reward/mean": 0.04748644679784775,
+      "rewards/cosine_scaled_reward/std": 0.47108832001686096,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1947.0,
+      "completions/mean_length": 856.578125,
+      "completions/mean_terminated_length": 818.1451416015625,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 0.13714285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3406151831150055,
+      "learning_rate": 4.79604490731896e-07,
+      "loss": -0.0,
+      "num_tokens": 12622807.0,
+      "reward": 0.7979192733764648,
+      "reward_std": 0.6180044412612915,
+      "rewards/cosine_scaled_reward/mean": -0.10104038566350937,
+      "rewards/cosine_scaled_reward/std": 0.44317325949668884,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1764.0,
+      "completions/mean_length": 726.34375,
+      "completions/mean_terminated_length": 683.7096557617188,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 0.1382857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.4178949296474457,
+      "learning_rate": 4.7185832004988133e-07,
+      "loss": 0.0,
+      "num_tokens": 12678989.0,
+      "reward": 1.161607265472412,
+      "reward_std": 0.6393733024597168,
+      "rewards/cosine_scaled_reward/mean": 0.08080361783504486,
+      "rewards/cosine_scaled_reward/std": 0.5313310027122498,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2003.0,
+      "completions/mean_length": 1133.796875,
+      "completions/mean_terminated_length": 1039.22412109375,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 0.13942857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3333284258842468,
+      "learning_rate": 4.641359520805548e-07,
+      "loss": 0.0,
+      "num_tokens": 12763112.0,
+      "reward": 0.9356573820114136,
+      "reward_std": 0.6247758269309998,
+      "rewards/cosine_scaled_reward/mean": -0.02435879409313202,
+      "rewards/cosine_scaled_reward/std": 0.4759780466556549,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1993.0,
+      "completions/mean_length": 1031.296875,
+      "completions/mean_terminated_length": 981.2950439453125,
+      "completions/min_length": 447.0,
+      "completions/min_terminated_length": 447.0,
+      "epoch": 0.14057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29939791560173035,
+      "learning_rate": 4.5643973913200837e-07,
+      "loss": -0.0,
+      "num_tokens": 12839347.0,
+      "reward": 0.7725162506103516,
+      "reward_std": 0.5560778379440308,
+      "rewards/cosine_scaled_reward/mean": -0.09811685979366302,
+      "rewards/cosine_scaled_reward/std": 0.3822804391384125,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2011.0,
+      "completions/mean_length": 979.234375,
+      "completions/mean_terminated_length": 944.758056640625,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "epoch": 0.1417142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34992095828056335,
+      "learning_rate": 4.4877202554526084e-07,
+      "loss": 0.0,
+      "num_tokens": 12912970.0,
+      "reward": 1.085427165031433,
+      "reward_std": 0.6837464570999146,
+      "rewards/cosine_scaled_reward/mean": 0.05052608996629715,
+      "rewards/cosine_scaled_reward/std": 0.4791998267173767,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1993.0,
+      "completions/mean_length": 1076.40625,
+      "completions/mean_terminated_length": 994.0678100585938,
+      "completions/min_length": 309.0,
+      "completions/min_terminated_length": 309.0,
+      "epoch": 0.14285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27060386538505554,
+      "learning_rate": 4.4113514698014953e-07,
+      "loss": -0.0,
+      "num_tokens": 12992788.0,
+      "reward": 1.0397578477859497,
+      "reward_std": 0.43823006749153137,
+      "rewards/cosine_scaled_reward/mean": 0.019878946244716644,
+      "rewards/cosine_scaled_reward/std": 0.46214956045150757,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1856.0,
+      "completions/mean_length": 1071.53125,
+      "completions/mean_terminated_length": 1006.4334106445312,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "epoch": 0.144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2776121497154236,
+      "learning_rate": 4.3353142970386557e-07,
+      "loss": 0.0,
+      "num_tokens": 13072662.0,
+      "reward": 1.0028693675994873,
+      "reward_std": 0.6879971027374268,
+      "rewards/cosine_scaled_reward/mean": 0.0014346465468406677,
+      "rewards/cosine_scaled_reward/std": 0.42488595843315125,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1868.0,
+      "completions/mean_length": 1180.484375,
+      "completions/mean_terminated_length": 1056.5535888671875,
+      "completions/min_length": 285.0,
+      "completions/min_terminated_length": 285.0,
+      "epoch": 0.14514285714285713,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2829054594039917,
+      "learning_rate": 4.2596318988235037e-07,
+      "loss": -0.0,
+      "num_tokens": 13159309.0,
+      "reward": 0.6576684713363647,
+      "reward_std": 0.66895592212677,
+      "rewards/cosine_scaled_reward/mean": -0.15554077923297882,
+      "rewards/cosine_scaled_reward/std": 0.3959099054336548,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1869.0,
+      "completions/mean_length": 1053.328125,
+      "completions/mean_terminated_length": 950.4310302734375,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "epoch": 0.1462857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29738253355026245,
+      "learning_rate": 4.1843273287476854e-07,
+      "loss": -0.0,
+      "num_tokens": 13237074.0,
+      "reward": 0.8851851224899292,
+      "reward_std": 0.7390589118003845,
+      "rewards/cosine_scaled_reward/mean": -0.041782446205616,
+      "rewards/cosine_scaled_reward/std": 0.46901625394821167,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1927.0,
+      "completions/mean_length": 1228.484375,
+      "completions/mean_terminated_length": 1111.4107666015625,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "epoch": 0.14742857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25943535566329956,
+      "learning_rate": 4.1094235253127374e-07,
+      "loss": -0.0,
+      "num_tokens": 13326401.0,
+      "reward": 0.9628820419311523,
+      "reward_std": 0.6490253210067749,
+      "rewards/cosine_scaled_reward/mean": 0.004878522828221321,
+      "rewards/cosine_scaled_reward/std": 0.45456331968307495,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1967.0,
+      "completions/mean_length": 1089.578125,
+      "completions/mean_terminated_length": 952.6607666015625,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "epoch": 0.14857142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3009719252586365,
+      "learning_rate": 4.034943304942796e-07,
+      "loss": 0.0,
+      "num_tokens": 13406638.0,
+      "reward": 0.5984547138214111,
+      "reward_std": 0.7008002996444702,
+      "rewards/cosine_scaled_reward/mean": -0.14608514308929443,
+      "rewards/cosine_scaled_reward/std": 0.37894922494888306,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1651.0,
+      "completions/mean_length": 1058.03125,
+      "completions/mean_terminated_length": 916.607177734375,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "epoch": 0.14971428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.306725412607193,
+      "learning_rate": 3.9609093550344907e-07,
+      "loss": 0.0,
+      "num_tokens": 13484088.0,
+      "reward": 1.0469268560409546,
+      "reward_std": 0.6023457050323486,
+      "rewards/cosine_scaled_reward/mean": 0.0703384131193161,
+      "rewards/cosine_scaled_reward/std": 0.47298464179039,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1773.0,
+      "completions/mean_length": 1342.78125,
+      "completions/mean_terminated_length": 919.6500244140625,
+      "completions/min_length": 366.0,
+      "completions/min_terminated_length": 366.0,
+      "epoch": 0.15085714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3032574951648712,
+      "learning_rate": 3.8873442270461485e-07,
+      "loss": -0.0,
+      "num_tokens": 13581090.0,
+      "reward": 0.4643245339393616,
+      "reward_std": 0.7533800601959229,
+      "rewards/cosine_scaled_reward/mean": -0.06471271812915802,
+      "rewards/cosine_scaled_reward/std": 0.4610835611820221,
+      "rewards/format_reward/mean": 0.59375,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1974.0,
+      "completions/mean_length": 1144.921875,
+      "completions/mean_terminated_length": 957.4906005859375,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "epoch": 0.152,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32285141944885254,
+      "learning_rate": 3.8142703296283953e-07,
+      "loss": 0.0,
+      "num_tokens": 13665589.0,
+      "reward": 0.5014957189559937,
+      "reward_std": 0.5352932214736938,
+      "rewards/cosine_scaled_reward/mean": -0.17112717032432556,
+      "rewards/cosine_scaled_reward/std": 0.28127768635749817,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1965.0,
+      "completions/mean_length": 975.53125,
+      "completions/mean_terminated_length": 958.5079956054688,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 0.15314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.40716752409935,
+      "learning_rate": 3.7417099217982686e-07,
+      "loss": -0.0,
+      "num_tokens": 13738591.0,
+      "reward": 1.1759617328643799,
+      "reward_std": 0.4804629683494568,
+      "rewards/cosine_scaled_reward/mean": 0.08798093348741531,
+      "rewards/cosine_scaled_reward/std": 0.5343761444091797,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1686.0,
+      "completions/max_terminated_length": 1686.0,
+      "completions/mean_length": 758.515625,
+      "completions/mean_terminated_length": 758.515625,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.15428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.42696353793144226,
+      "learning_rate": 3.6696851061588994e-07,
+      "loss": -0.0,
+      "num_tokens": 13797608.0,
+      "reward": 1.3851683139801025,
+      "reward_std": 0.5234883427619934,
+      "rewards/cosine_scaled_reward/mean": 0.19258417189121246,
+      "rewards/cosine_scaled_reward/std": 0.49346473813056946,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2007.0,
+      "completions/mean_length": 1169.875,
+      "completions/mean_terminated_length": 1095.4576416015625,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "epoch": 0.15542857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28027620911598206,
+      "learning_rate": 3.5982178221668533e-07,
+      "loss": -0.0,
+      "num_tokens": 13883152.0,
+      "reward": 1.0174503326416016,
+      "reward_std": 0.5889347791671753,
+      "rewards/cosine_scaled_reward/mean": 0.016537662595510483,
+      "rewards/cosine_scaled_reward/std": 0.4763922095298767,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1814.0,
+      "completions/mean_length": 1105.3125,
+      "completions/mean_terminated_length": 1042.4666748046875,
+      "completions/min_length": 446.0,
+      "completions/min_terminated_length": 446.0,
+      "epoch": 0.15657142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3002299666404724,
+      "learning_rate": 3.5273298394491515e-07,
+      "loss": 0.0,
+      "num_tokens": 13964500.0,
+      "reward": 0.841381847858429,
+      "reward_std": 0.6354345083236694,
+      "rewards/cosine_scaled_reward/mean": -0.07149658352136612,
+      "rewards/cosine_scaled_reward/std": 0.4138363003730774,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1983.0,
+      "completions/mean_length": 1125.484375,
+      "completions/mean_terminated_length": 974.5272216796875,
+      "completions/min_length": 361.0,
+      "completions/min_terminated_length": 361.0,
+      "epoch": 0.15771428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28766506910324097,
+      "learning_rate": 3.45704275117204e-07,
+      "loss": -0.0,
+      "num_tokens": 14047843.0,
+      "reward": 0.8758631944656372,
+      "reward_std": 0.7212573289871216,
+      "rewards/cosine_scaled_reward/mean": -0.05425591766834259,
+      "rewards/cosine_scaled_reward/std": 0.4783853590488434,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2023.0,
+      "completions/mean_length": 1216.171875,
+      "completions/mean_terminated_length": 1160.7166748046875,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "epoch": 0.15885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2882857024669647,
+      "learning_rate": 3.387377967463493e-07,
+      "loss": -0.0,
+      "num_tokens": 14136318.0,
+      "reward": 0.7189284563064575,
+      "reward_std": 0.4593912959098816,
+      "rewards/cosine_scaled_reward/mean": -0.13272328674793243,
+      "rewards/cosine_scaled_reward/std": 0.33584704995155334,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2017.0,
+      "completions/mean_length": 1142.140625,
+      "completions/mean_terminated_length": 1012.732177734375,
+      "completions/min_length": 389.0,
+      "completions/min_terminated_length": 389.0,
+      "epoch": 0.16,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.3000667095184326,
+      "learning_rate": 3.3183567088914833e-07,
+      "loss": 0.0,
+      "num_tokens": 14219639.0,
+      "reward": 0.8278639316558838,
+      "reward_std": 0.46724599599838257,
+      "rewards/cosine_scaled_reward/mean": -0.03919300064444542,
+      "rewards/cosine_scaled_reward/std": 0.4650508463382721,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1919.0,
+      "completions/mean_length": 1025.421875,
+      "completions/mean_terminated_length": 975.131103515625,
+      "completions/min_length": 394.0,
+      "completions/min_terminated_length": 394.0,
+      "epoch": 0.16114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3207882046699524,
+      "learning_rate": 3.250000000000001e-07,
+      "loss": 0.0,
+      "num_tokens": 14295826.0,
+      "reward": 0.8871637582778931,
+      "reward_std": 0.6538586616516113,
+      "rewards/cosine_scaled_reward/mean": -0.04079316183924675,
+      "rewards/cosine_scaled_reward/std": 0.43451616168022156,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1984.0,
+      "completions/mean_length": 1233.90625,
+      "completions/mean_terminated_length": 1149.689697265625,
+      "completions/min_length": 278.0,
+      "completions/min_terminated_length": 278.0,
+      "epoch": 0.16228571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3009903132915497,
+      "learning_rate": 3.182328662904756e-07,
+      "loss": 0.0,
+      "num_tokens": 14385300.0,
+      "reward": 0.8573208451271057,
+      "reward_std": 0.6099269390106201,
+      "rewards/cosine_scaled_reward/mean": -0.055714573711156845,
+      "rewards/cosine_scaled_reward/std": 0.43728360533714294,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1946.0,
+      "completions/mean_length": 1136.078125,
+      "completions/mean_terminated_length": 1005.8035888671875,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "epoch": 0.16342857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31794917583465576,
+      "learning_rate": 3.115363310950578e-07,
+      "loss": 0.0,
+      "num_tokens": 14468825.0,
+      "reward": 0.6553314924240112,
+      "reward_std": 0.6344339847564697,
+      "rewards/cosine_scaled_reward/mean": -0.11764675378799438,
+      "rewards/cosine_scaled_reward/std": 0.3099633455276489,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1947.0,
+      "completions/mean_length": 1220.6875,
+      "completions/mean_terminated_length": 1029.769287109375,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 0.16457142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3814108967781067,
+      "learning_rate": 3.0491243424323783e-07,
+      "loss": 0.0,
+      "num_tokens": 14558437.0,
+      "reward": 0.7285318970680237,
+      "reward_std": 0.8925961256027222,
+      "rewards/cosine_scaled_reward/mean": -0.05760904401540756,
+      "rewards/cosine_scaled_reward/std": 0.492266446352005,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1880.0,
+      "completions/mean_length": 969.796875,
+      "completions/mean_terminated_length": 916.7704467773438,
+      "completions/min_length": 275.0,
+      "completions/min_terminated_length": 275.0,
+      "epoch": 0.1657142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3201180398464203,
+      "learning_rate": 2.9836319343816397e-07,
+      "loss": -0.0,
+      "num_tokens": 14630448.0,
+      "reward": 0.8149441480636597,
+      "reward_std": 0.5824600458145142,
+      "rewards/cosine_scaled_reward/mean": -0.08471541851758957,
+      "rewards/cosine_scaled_reward/std": 0.475755512714386,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1943.0,
+      "completions/mean_length": 1034.484375,
+      "completions/mean_terminated_length": 966.9166870117188,
+      "completions/min_length": 482.0,
+      "completions/min_terminated_length": 482.0,
+      "epoch": 0.16685714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28184273838996887,
+      "learning_rate": 2.918906036420294e-07,
+      "loss": -0.0,
+      "num_tokens": 14707271.0,
+      "reward": 0.8387603759765625,
+      "reward_std": 0.5346506237983704,
+      "rewards/cosine_scaled_reward/mean": -0.07280732691287994,
+      "rewards/cosine_scaled_reward/std": 0.43024110794067383,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1980.0,
+      "completions/mean_length": 1249.984375,
+      "completions/mean_terminated_length": 1046.568603515625,
+      "completions/min_length": 550.0,
+      "completions/min_terminated_length": 550.0,
+      "epoch": 0.168,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32145801186561584,
+      "learning_rate": 2.854966364683872e-07,
+      "loss": 0.0,
+      "num_tokens": 14798054.0,
+      "reward": 0.7505484819412231,
+      "reward_std": 0.5473448634147644,
+      "rewards/cosine_scaled_reward/mean": -0.07003828883171082,
+      "rewards/cosine_scaled_reward/std": 0.4046306014060974,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1844.0,
+      "completions/mean_length": 1062.828125,
+      "completions/mean_terminated_length": 960.913818359375,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "epoch": 0.16914285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2667451500892639,
+      "learning_rate": 2.791832395815782e-07,
+      "loss": -0.0,
+      "num_tokens": 14877259.0,
+      "reward": 0.7823130488395691,
+      "reward_std": 0.48230016231536865,
+      "rewards/cosine_scaled_reward/mean": -0.06978099048137665,
+      "rewards/cosine_scaled_reward/std": 0.37567150592803955,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1386.875,
+      "completions/mean_terminated_length": 1086.3636474609375,
+      "completions/min_length": 439.0,
+      "completions/min_terminated_length": 439.0,
+      "epoch": 0.1702857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2730913758277893,
+      "learning_rate": 2.729523361034538e-07,
+      "loss": 0.0,
+      "num_tokens": 14977915.0,
+      "reward": 0.48214927315711975,
+      "reward_std": 0.8376681804656982,
+      "rewards/cosine_scaled_reward/mean": -0.14173786342144012,
+      "rewards/cosine_scaled_reward/std": 0.4272434711456299,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1831.0,
+      "completions/mean_length": 994.15625,
+      "completions/mean_terminated_length": 942.3278198242188,
+      "completions/min_length": 322.0,
+      "completions/min_terminated_length": 322.0,
+      "epoch": 0.17142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2946690022945404,
+      "learning_rate": 2.6680582402757324e-07,
+      "loss": -0.0,
+      "num_tokens": 15052045.0,
+      "reward": 0.8893749713897705,
+      "reward_std": 0.7130615711212158,
+      "rewards/cosine_scaled_reward/mean": -0.05531252920627594,
+      "rewards/cosine_scaled_reward/std": 0.4389563202857971,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1975.0,
+      "completions/mean_length": 1094.4375,
+      "completions/mean_terminated_length": 917.8518676757812,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 0.17257142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29988256096839905,
+      "learning_rate": 2.6074557564105724e-07,
+      "loss": 0.0,
+      "num_tokens": 15132769.0,
+      "reward": 1.088501214981079,
+      "reward_std": 0.9213382005691528,
+      "rewards/cosine_scaled_reward/mean": 0.10675054788589478,
+      "rewards/cosine_scaled_reward/std": 0.510394811630249,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1976.0,
+      "completions/mean_length": 1024.203125,
+      "completions/mean_terminated_length": 937.440673828125,
+      "completions/min_length": 140.0,
+      "completions/min_terminated_length": 140.0,
+      "epoch": 0.1737142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.46614158153533936,
+      "learning_rate": 2.547734369542718e-07,
+      "loss": -0.0,
+      "num_tokens": 15208982.0,
+      "reward": 0.7280048131942749,
+      "reward_std": 0.706195592880249,
+      "rewards/cosine_scaled_reward/mean": -0.10474759340286255,
+      "rewards/cosine_scaled_reward/std": 0.45987388491630554,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1955.0,
+      "completions/mean_length": 1180.234375,
+      "completions/mean_terminated_length": 1056.2679443359375,
+      "completions/min_length": 298.0,
+      "completions/min_terminated_length": 298.0,
+      "epoch": 0.17485714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33488133549690247,
+      "learning_rate": 2.488912271385139e-07,
+      "loss": -0.0,
+      "num_tokens": 15295661.0,
+      "reward": 0.4985957443714142,
+      "reward_std": 0.4677598178386688,
+      "rewards/cosine_scaled_reward/mean": -0.2272646427154541,
+      "rewards/cosine_scaled_reward/std": 0.2307518571615219,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2019.0,
+      "completions/mean_length": 1340.296875,
+      "completions/mean_terminated_length": 1142.1400146484375,
+      "completions/min_length": 559.0,
+      "completions/min_terminated_length": 559.0,
+      "epoch": 0.176,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25304633378982544,
+      "learning_rate": 2.4310073797187573e-07,
+      "loss": -0.0,
+      "num_tokens": 15392504.0,
+      "reward": 0.7636169195175171,
+      "reward_std": 0.7114115953445435,
+      "rewards/cosine_scaled_reward/mean": -0.03225403279066086,
+      "rewards/cosine_scaled_reward/std": 0.42686402797698975,
+      "rewards/format_reward/mean": 0.828125,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1909.0,
+      "completions/mean_length": 915.6875,
+      "completions/mean_terminated_length": 798.5516967773438,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "epoch": 0.17714285714285713,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.35448068380355835,
+      "learning_rate": 2.374037332934512e-07,
+      "loss": 0.0,
+      "num_tokens": 15461732.0,
+      "reward": 0.736025333404541,
+      "reward_std": 0.5466883182525635,
+      "rewards/cosine_scaled_reward/mean": -0.11636234819889069,
+      "rewards/cosine_scaled_reward/std": 0.43356192111968994,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1913.0,
+      "completions/mean_length": 1157.90625,
+      "completions/mean_terminated_length": 952.5000610351562,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "epoch": 0.1782857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.4357910454273224,
+      "learning_rate": 2.3180194846605364e-07,
+      "loss": -0.0,
+      "num_tokens": 15545942.0,
+      "reward": 0.8330824971199036,
+      "reward_std": 0.725536048412323,
+      "rewards/cosine_scaled_reward/mean": -0.02095877379179001,
+      "rewards/cosine_scaled_reward/std": 0.4767586290836334,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2028.0,
+      "completions/mean_length": 1157.75,
+      "completions/mean_terminated_length": 1030.571533203125,
+      "completions/min_length": 485.0,
+      "completions/min_terminated_length": 485.0,
+      "epoch": 0.17942857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29891225695610046,
+      "learning_rate": 2.2629708984760706e-07,
+      "loss": 0.0,
+      "num_tokens": 15629998.0,
+      "reward": 0.6674755811691284,
+      "reward_std": 0.6577311754226685,
+      "rewards/cosine_scaled_reward/mean": -0.13501222431659698,
+      "rewards/cosine_scaled_reward/std": 0.36102381348609924,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1856.0,
+      "completions/mean_length": 1013.6875,
+      "completions/mean_terminated_length": 962.8196411132812,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "epoch": 0.18057142857142858,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.2723560333251953,
+      "learning_rate": 2.2089083427137329e-07,
+      "loss": 0.0,
+      "num_tokens": 15704994.0,
+      "reward": 0.9709224104881287,
+      "reward_std": 0.48810505867004395,
+      "rewards/cosine_scaled_reward/mean": -0.014538809657096863,
+      "rewards/cosine_scaled_reward/std": 0.4970093369483948,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1881.0,
+      "completions/mean_length": 1081.296875,
+      "completions/mean_terminated_length": 962.5789794921875,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "epoch": 0.18171428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2894439697265625,
+      "learning_rate": 2.1558482853517253e-07,
+      "loss": -0.0,
+      "num_tokens": 15785877.0,
+      "reward": 0.5938807725906372,
+      "reward_std": 0.592242956161499,
+      "rewards/cosine_scaled_reward/mean": -0.16399714350700378,
+      "rewards/cosine_scaled_reward/std": 0.3423241078853607,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1750.0,
+      "completions/mean_length": 968.25,
+      "completions/mean_terminated_length": 915.1474609375,
+      "completions/min_length": 417.0,
+      "completions/min_terminated_length": 417.0,
+      "epoch": 0.18285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3261898159980774,
+      "learning_rate": 2.1038068889975259e-07,
+      "loss": 0.0,
+      "num_tokens": 15859429.0,
+      "reward": 1.2050117254257202,
+      "reward_std": 0.6944217681884766,
+      "rewards/cosine_scaled_reward/mean": 0.10250584781169891,
+      "rewards/cosine_scaled_reward/std": 0.5283173322677612,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2026.0,
+      "completions/mean_length": 1031.75,
+      "completions/mean_terminated_length": 945.6271362304688,
+      "completions/min_length": 414.0,
+      "completions/min_terminated_length": 414.0,
+      "epoch": 0.184,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34274861216545105,
+      "learning_rate": 2.0528000059645995e-07,
+      "loss": -0.0,
+      "num_tokens": 15935453.0,
+      "reward": 0.9563960433006287,
+      "reward_std": 0.6316370964050293,
+      "rewards/cosine_scaled_reward/mean": 0.009448029100894928,
+      "rewards/cosine_scaled_reward/std": 0.46292582154273987,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.234375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1891.0,
+      "completions/mean_length": 1167.828125,
+      "completions/mean_terminated_length": 898.3877563476562,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "epoch": 0.18514285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3887297511100769,
+      "learning_rate": 2.0028431734436308e-07,
+      "loss": 0.0,
+      "num_tokens": 16020498.0,
+      "reward": 0.6932262182235718,
+      "reward_std": 0.8278101682662964,
+      "rewards/cosine_scaled_reward/mean": -0.08307439833879471,
+      "rewards/cosine_scaled_reward/std": 0.3847581744194031,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1820.0,
+      "completions/mean_length": 1058.84375,
+      "completions/mean_terminated_length": 956.5172119140625,
+      "completions/min_length": 394.0,
+      "completions/min_terminated_length": 394.0,
+      "epoch": 0.18628571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30917680263519287,
+      "learning_rate": 1.9539516087697517e-07,
+      "loss": 0.0,
+      "num_tokens": 16099448.0,
+      "reward": 1.3529155254364014,
+      "reward_std": 0.8906396627426147,
+      "rewards/cosine_scaled_reward/mean": 0.22333277761936188,
+      "rewards/cosine_scaled_reward/std": 0.5322388410568237,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1974.0,
+      "completions/mean_length": 988.703125,
+      "completions/mean_terminated_length": 918.0833740234375,
+      "completions/min_length": 317.0,
+      "completions/min_terminated_length": 317.0,
+      "epoch": 0.18742857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33646658062934875,
+      "learning_rate": 1.9061402047871833e-07,
+      "loss": 0.0,
+      "num_tokens": 16173253.0,
+      "reward": 1.046778678894043,
+      "reward_std": 0.6892427206039429,
+      "rewards/cosine_scaled_reward/mean": 0.0390143096446991,
+      "rewards/cosine_scaled_reward/std": 0.4476637840270996,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2033.0,
+      "completions/mean_length": 1103.5,
+      "completions/mean_terminated_length": 948.9454345703125,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "epoch": 0.18857142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.338925838470459,
+      "learning_rate": 1.8594235253127372e-07,
+      "loss": -0.0,
+      "num_tokens": 16255293.0,
+      "reward": 0.7887892723083496,
+      "reward_std": 0.6329070329666138,
+      "rewards/cosine_scaled_reward/mean": -0.0665428563952446,
+      "rewards/cosine_scaled_reward/std": 0.4880979061126709,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1714.0,
+      "completions/mean_length": 1166.265625,
+      "completions/mean_terminated_length": 1002.9815063476562,
+      "completions/min_length": 500.0,
+      "completions/min_terminated_length": 500.0,
+      "epoch": 0.18971428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29118841886520386,
+      "learning_rate": 1.8138158006995363e-07,
+      "loss": -0.0,
+      "num_tokens": 16341510.0,
+      "reward": 0.5021259784698486,
+      "reward_std": 0.5949545502662659,
+      "rewards/cosine_scaled_reward/mean": -0.18643701076507568,
+      "rewards/cosine_scaled_reward/std": 0.3388413190841675,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1756.0,
+      "completions/mean_length": 1027.96875,
+      "completions/mean_terminated_length": 922.4482421875,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "epoch": 0.19085714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3123703598976135,
+      "learning_rate": 1.7693309235023127e-07,
+      "loss": -0.0,
+      "num_tokens": 16418844.0,
+      "reward": 0.6054480671882629,
+      "reward_std": 0.6668864488601685,
+      "rewards/cosine_scaled_reward/mean": -0.17383846640586853,
+      "rewards/cosine_scaled_reward/std": 0.34976449608802795,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2033.0,
+      "completions/mean_length": 1131.890625,
+      "completions/mean_terminated_length": 1086.8360595703125,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 0.192,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2515013515949249,
+      "learning_rate": 1.7259824442455923e-07,
+      "loss": 0.0,
+      "num_tokens": 16502125.0,
+      "reward": 0.929424524307251,
+      "reward_std": 0.6242066621780396,
+      "rewards/cosine_scaled_reward/mean": -0.011850237846374512,
+      "rewards/cosine_scaled_reward/std": 0.4718935191631317,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1737.0,
+      "completions/mean_length": 908.53125,
+      "completions/mean_terminated_length": 871.774169921875,
+      "completions/min_length": 428.0,
+      "completions/min_terminated_length": 428.0,
+      "epoch": 0.19314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29841023683547974,
+      "learning_rate": 1.6837835672960831e-07,
+      "loss": -0.0,
+      "num_tokens": 16570895.0,
+      "reward": 1.6184587478637695,
+      "reward_std": 0.5710533857345581,
+      "rewards/cosine_scaled_reward/mean": 0.3092293441295624,
+      "rewards/cosine_scaled_reward/std": 0.5226604342460632,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2031.0,
+      "completions/mean_length": 1005.109375,
+      "completions/mean_terminated_length": 834.4545288085938,
+      "completions/min_length": 264.0,
+      "completions/min_terminated_length": 264.0,
+      "epoch": 0.19428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3179849088191986,
+      "learning_rate": 1.6427471468404952e-07,
+      "loss": -0.0,
+      "num_tokens": 16645006.0,
+      "reward": 1.0071099996566772,
+      "reward_std": 0.3746073246002197,
+      "rewards/cosine_scaled_reward/mean": 0.06605499982833862,
+      "rewards/cosine_scaled_reward/std": 0.4378518760204315,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1891.0,
+      "completions/mean_length": 1234.65625,
+      "completions/mean_terminated_length": 940.4680786132812,
+      "completions/min_length": 500.0,
+      "completions/min_terminated_length": 500.0,
+      "epoch": 0.19542857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2992324233055115,
+      "learning_rate": 1.6028856829700258e-07,
+      "loss": -0.0,
+      "num_tokens": 16734416.0,
+      "reward": 0.7108581066131592,
+      "reward_std": 0.7254206538200378,
+      "rewards/cosine_scaled_reward/mean": -0.02738344669342041,
+      "rewards/cosine_scaled_reward/std": 0.44080549478530884,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1638.0,
+      "completions/mean_length": 900.234375,
+      "completions/mean_terminated_length": 823.7167358398438,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 0.19657142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.47149336338043213,
+      "learning_rate": 1.5642113178727193e-07,
+      "loss": 0.0,
+      "num_tokens": 16802647.0,
+      "reward": 1.3995718955993652,
+      "reward_std": 0.5902794599533081,
+      "rewards/cosine_scaled_reward/mean": 0.2310360074043274,
+      "rewards/cosine_scaled_reward/std": 0.5026565194129944,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2041.0,
+      "completions/mean_length": 925.078125,
+      "completions/mean_terminated_length": 787.1754150390625,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 0.1977142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3893924057483673,
+      "learning_rate": 1.5267358321348285e-07,
+      "loss": 0.0,
+      "num_tokens": 16873164.0,
+      "reward": 0.6720038056373596,
+      "reward_std": 0.667186975479126,
+      "rewards/cosine_scaled_reward/mean": -0.12493559718132019,
+      "rewards/cosine_scaled_reward/std": 0.40216636657714844,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2040.0,
+      "completions/mean_length": 1005.578125,
+      "completions/mean_terminated_length": 971.9515991210938,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 0.19885714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.39529484510421753,
+      "learning_rate": 1.4904706411523448e-07,
+      "loss": -0.0,
+      "num_tokens": 16947857.0,
+      "reward": 0.9172019958496094,
+      "reward_std": 0.6198633313179016,
+      "rewards/cosine_scaled_reward/mean": -0.03358650952577591,
+      "rewards/cosine_scaled_reward/std": 0.4403606951236725,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1828.0,
+      "completions/mean_length": 952.296875,
+      "completions/mean_terminated_length": 898.4097900390625,
+      "completions/min_length": 321.0,
+      "completions/min_terminated_length": 321.0,
+      "epoch": 0.2,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.322712779045105,
+      "learning_rate": 1.4554267916537495e-07,
+      "loss": 0.0,
+      "num_tokens": 17019628.0,
+      "reward": 0.871549129486084,
+      "reward_std": 0.46009254455566406,
+      "rewards/cosine_scaled_reward/mean": -0.05641293525695801,
+      "rewards/cosine_scaled_reward/std": 0.44415631890296936,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1975.0,
+      "completions/mean_length": 1048.453125,
+      "completions/mean_terminated_length": 945.0516967773438,
+      "completions/min_length": 287.0,
+      "completions/min_terminated_length": 287.0,
+      "epoch": 0.20114285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3287680447101593,
+      "learning_rate": 1.4216149583350755e-07,
+      "loss": -0.0,
+      "num_tokens": 17097897.0,
+      "reward": 0.839117705821991,
+      "reward_std": 0.7753168344497681,
+      "rewards/cosine_scaled_reward/mean": -0.04137861356139183,
+      "rewards/cosine_scaled_reward/std": 0.43453913927078247,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1915.0,
+      "completions/mean_length": 968.34375,
+      "completions/mean_terminated_length": 933.51611328125,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "epoch": 0.2022857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3266870677471161,
+      "learning_rate": 1.3890454406082956e-07,
+      "loss": -0.0,
+      "num_tokens": 17170095.0,
+      "reward": 1.0329997539520264,
+      "reward_std": 0.7290528416633606,
+      "rewards/cosine_scaled_reward/mean": 0.024312350898981094,
+      "rewards/cosine_scaled_reward/std": 0.46764034032821655,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2028.0,
+      "completions/mean_length": 1016.0625,
+      "completions/mean_terminated_length": 909.3103637695312,
+      "completions/min_length": 414.0,
+      "completions/min_terminated_length": 414.0,
+      "epoch": 0.20342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.330020546913147,
+      "learning_rate": 1.3577281594640182e-07,
+      "loss": -0.0,
+      "num_tokens": 17246659.0,
+      "reward": 1.1118203401565552,
+      "reward_std": 0.7913287878036499,
+      "rewards/cosine_scaled_reward/mean": 0.07934767752885818,
+      "rewards/cosine_scaled_reward/std": 0.5148099660873413,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.234375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1945.0,
+      "completions/mean_length": 1227.78125,
+      "completions/mean_terminated_length": 976.69384765625,
+      "completions/min_length": 270.0,
+      "completions/min_terminated_length": 270.0,
+      "epoch": 0.20457142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33415722846984863,
+      "learning_rate": 1.3276726544494571e-07,
+      "loss": 0.0,
+      "num_tokens": 17336069.0,
+      "reward": 0.608305037021637,
+      "reward_std": 0.5569274425506592,
+      "rewards/cosine_scaled_reward/mean": -0.10991000384092331,
+      "rewards/cosine_scaled_reward/std": 0.3418741822242737,
+      "rewards/format_reward/mean": 0.828125,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1932.0,
+      "completions/mean_length": 1024.46875,
+      "completions/mean_terminated_length": 956.2333984375,
+      "completions/min_length": 264.0,
+      "completions/min_terminated_length": 264.0,
+      "epoch": 0.2057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3402194082736969,
+      "learning_rate": 1.2988880807625927e-07,
+      "loss": -0.0,
+      "num_tokens": 17412811.0,
+      "reward": 1.6137604713439941,
+      "reward_std": 0.8008866310119629,
+      "rewards/cosine_scaled_reward/mean": 0.31469273567199707,
+      "rewards/cosine_scaled_reward/std": 0.5089212656021118,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2001.0,
+      "completions/mean_length": 1157.546875,
+      "completions/mean_terminated_length": 992.6481323242188,
+      "completions/min_length": 387.0,
+      "completions/min_terminated_length": 387.0,
+      "epoch": 0.20685714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29356250166893005,
+      "learning_rate": 1.2713832064634125e-07,
+      "loss": -0.0,
+      "num_tokens": 17498366.0,
+      "reward": 0.7507010698318481,
+      "reward_std": 0.5088521242141724,
+      "rewards/cosine_scaled_reward/mean": -0.07777446508407593,
+      "rewards/cosine_scaled_reward/std": 0.4100310504436493,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.234375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1840.0,
+      "completions/mean_length": 1166.390625,
+      "completions/mean_terminated_length": 896.5101928710938,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 0.208,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2653217613697052,
+      "learning_rate": 1.2451664098030743e-07,
+      "loss": -0.0,
+      "num_tokens": 17582807.0,
+      "reward": 0.7447050213813782,
+      "reward_std": 0.8267481327056885,
+      "rewards/cosine_scaled_reward/mean": -0.04170997440814972,
+      "rewards/cosine_scaled_reward/std": 0.4390917420387268,
+      "rewards/format_reward/mean": 0.828125,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1843.0,
+      "completions/mean_length": 1003.203125,
+      "completions/mean_terminated_length": 933.550048828125,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "epoch": 0.20914285714285713,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3268946707248688,
+      "learning_rate": 1.220245676671809e-07,
+      "loss": 0.0,
+      "num_tokens": 17657628.0,
+      "reward": 1.0635898113250732,
+      "reward_std": 0.5967966914176941,
+      "rewards/cosine_scaled_reward/mean": 0.039607420563697815,
+      "rewards/cosine_scaled_reward/std": 0.43730178475379944,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1941.0,
+      "completions/mean_length": 1025.171875,
+      "completions/mean_terminated_length": 938.4915161132812,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.2102857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.36050307750701904,
+      "learning_rate": 1.1966285981663407e-07,
+      "loss": 0.0,
+      "num_tokens": 17734591.0,
+      "reward": 0.6448719501495361,
+      "reward_std": 0.503462553024292,
+      "rewards/cosine_scaled_reward/mean": -0.14631402492523193,
+      "rewards/cosine_scaled_reward/std": 0.3733954429626465,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1779.0,
+      "completions/mean_length": 969.015625,
+      "completions/mean_terminated_length": 934.2096557617188,
+      "completions/min_length": 107.0,
+      "completions/min_terminated_length": 107.0,
+      "epoch": 0.21142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.42219310998916626,
+      "learning_rate": 1.1743223682775649e-07,
+      "loss": -0.0,
+      "num_tokens": 17806792.0,
+      "reward": 0.7470877766609192,
+      "reward_std": 0.5973426103591919,
+      "rewards/cosine_scaled_reward/mean": -0.11864358186721802,
+      "rewards/cosine_scaled_reward/std": 0.41184645891189575,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1667.0,
+      "completions/mean_length": 1076.984375,
+      "completions/mean_terminated_length": 938.2678833007812,
+      "completions/min_length": 404.0,
+      "completions/min_terminated_length": 404.0,
+      "epoch": 0.21257142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30614498257637024,
+      "learning_rate": 1.1533337816991931e-07,
+      "loss": -0.0,
+      "num_tokens": 17886415.0,
+      "reward": 0.804481029510498,
+      "reward_std": 0.4629480838775635,
+      "rewards/cosine_scaled_reward/mean": -0.03525950014591217,
+      "rewards/cosine_scaled_reward/std": 0.45060867071151733,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1879.0,
+      "completions/mean_length": 1112.1875,
+      "completions/mean_terminated_length": 1049.800048828125,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 0.21371428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.4846937656402588,
+      "learning_rate": 1.1336692317580158e-07,
+      "loss": 0.0,
+      "num_tokens": 17968019.0,
+      "reward": 0.6981200575828552,
+      "reward_std": 0.53022301197052,
+      "rewards/cosine_scaled_reward/mean": -0.1275024712085724,
+      "rewards/cosine_scaled_reward/std": 0.38560083508491516,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1910.0,
+      "completions/mean_length": 1079.90625,
+      "completions/mean_terminated_length": 997.8643798828125,
+      "completions/min_length": 295.0,
+      "completions/min_terminated_length": 295.0,
+      "epoch": 0.21485714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34140780568122864,
+      "learning_rate": 1.1153347084664419e-07,
+      "loss": -0.0,
+      "num_tokens": 18048933.0,
+      "reward": 0.5326423645019531,
+      "reward_std": 0.5487440824508667,
+      "rewards/cosine_scaled_reward/mean": -0.22586631774902344,
+      "rewards/cosine_scaled_reward/std": 0.3085760772228241,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2002.0,
+      "completions/mean_length": 868.546875,
+      "completions/mean_terminated_length": 830.5,
+      "completions/min_length": 37.0,
+      "completions/min_terminated_length": 37.0,
+      "epoch": 0.216,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.6607878804206848,
+      "learning_rate": 1.0983357966978745e-07,
+      "loss": -0.0,
+      "num_tokens": 18113808.0,
+      "reward": 0.7490335702896118,
+      "reward_std": 0.6654466390609741,
+      "rewards/cosine_scaled_reward/mean": -0.11767073720693588,
+      "rewards/cosine_scaled_reward/std": 0.4015049338340759,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1713.0,
+      "completions/mean_length": 938.8125,
+      "completions/mean_terminated_length": 903.0322265625,
+      "completions/min_length": 359.0,
+      "completions/min_terminated_length": 359.0,
+      "epoch": 0.21714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3069080710411072,
+      "learning_rate": 1.0826776744855121e-07,
+      "loss": -0.0,
+      "num_tokens": 18183660.0,
+      "reward": 0.9838922023773193,
+      "reward_std": 0.5085676908493042,
+      "rewards/cosine_scaled_reward/mean": -0.00024138391017913818,
+      "rewards/cosine_scaled_reward/std": 0.44459760189056396,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1851.0,
+      "completions/max_terminated_length": 1851.0,
+      "completions/mean_length": 902.453125,
+      "completions/mean_terminated_length": 902.453125,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 0.21828571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.35081905126571655,
+      "learning_rate": 1.068365111445064e-07,
+      "loss": 0.0,
+      "num_tokens": 18251705.0,
+      "reward": 1.247175931930542,
+      "reward_std": 0.8716963529586792,
+      "rewards/cosine_scaled_reward/mean": 0.13140051066875458,
+      "rewards/cosine_scaled_reward/std": 0.5292099118232727,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2022.0,
+      "completions/mean_length": 1270.0,
+      "completions/mean_terminated_length": 1052.1600341796875,
+      "completions/min_length": 427.0,
+      "completions/min_terminated_length": 427.0,
+      "epoch": 0.21942857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2882588505744934,
+      "learning_rate": 1.0554024673218806e-07,
+      "loss": -0.0,
+      "num_tokens": 18344281.0,
+      "reward": 0.5913476943969727,
+      "reward_std": 0.6203497052192688,
+      "rewards/cosine_scaled_reward/mean": -0.11057613790035248,
+      "rewards/cosine_scaled_reward/std": 0.33690571784973145,
+      "rewards/format_reward/mean": 0.8125,
+      "rewards/format_reward/std": 0.39339789748191833,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1806.0,
+      "completions/mean_length": 1134.3125,
+      "completions/mean_terminated_length": 1022.1052856445312,
+      "completions/min_length": 475.0,
+      "completions/min_terminated_length": 475.0,
+      "epoch": 0.22057142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30495956540107727,
+      "learning_rate": 1.0437936906629334e-07,
+      "loss": 0.0,
+      "num_tokens": 18428021.0,
+      "reward": 0.9724597930908203,
+      "reward_std": 0.6338238716125488,
+      "rewards/cosine_scaled_reward/mean": 0.025292381644248962,
+      "rewards/cosine_scaled_reward/std": 0.47308972477912903,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.234375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2022.0,
+      "completions/mean_length": 1284.34375,
+      "completions/mean_terminated_length": 1050.5714111328125,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "epoch": 0.22171428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29666370153427124,
+      "learning_rate": 1.0335423176140511e-07,
+      "loss": -0.0,
+      "num_tokens": 18521579.0,
+      "reward": 0.970361590385437,
+      "reward_std": 0.8541973829269409,
+      "rewards/cosine_scaled_reward/mean": 0.055493295192718506,
+      "rewards/cosine_scaled_reward/std": 0.5139825344085693,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1614.0,
+      "completions/mean_length": 1111.140625,
+      "completions/mean_terminated_length": 957.8363037109375,
+      "completions/min_length": 612.0,
+      "completions/min_terminated_length": 612.0,
+      "epoch": 0.22285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2935192883014679,
+      "learning_rate": 1.0246514708427701e-07,
+      "loss": -0.0,
+      "num_tokens": 18603836.0,
+      "reward": 0.9238024353981018,
+      "reward_std": 0.7688024044036865,
+      "rewards/cosine_scaled_reward/mean": 0.008776212111115456,
+      "rewards/cosine_scaled_reward/std": 0.4346567392349243,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1913.0,
+      "completions/mean_length": 1110.28125,
+      "completions/mean_terminated_length": 1064.163818359375,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "epoch": 0.224,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31850409507751465,
+      "learning_rate": 1.017123858587145e-07,
+      "loss": 0.0,
+      "num_tokens": 18686486.0,
+      "reward": 1.0064561367034912,
+      "reward_std": 0.6142268776893616,
+      "rewards/cosine_scaled_reward/mean": 0.0032280460000038147,
+      "rewards/cosine_scaled_reward/std": 0.4689313769340515,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1862.0,
+      "completions/max_terminated_length": 1862.0,
+      "completions/mean_length": 867.390625,
+      "completions/mean_terminated_length": 867.390625,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "epoch": 0.22514285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.36897119879722595,
+      "learning_rate": 1.0109617738307911e-07,
+      "loss": 0.0,
+      "num_tokens": 18752367.0,
+      "reward": 1.2200298309326172,
+      "reward_std": 0.7840542197227478,
+      "rewards/cosine_scaled_reward/mean": 0.11001493036746979,
+      "rewards/cosine_scaled_reward/std": 0.5105303525924683,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2032.0,
+      "completions/mean_length": 1018.171875,
+      "completions/mean_terminated_length": 911.637939453125,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 0.22628571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33654487133026123,
+      "learning_rate": 1.0061670936044178e-07,
+      "loss": 0.0,
+      "num_tokens": 18829034.0,
+      "reward": 1.0653846263885498,
+      "reward_std": 0.7624523043632507,
+      "rewards/cosine_scaled_reward/mean": 0.04831730201840401,
+      "rewards/cosine_scaled_reward/std": 0.4961619973182678,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2038.0,
+      "completions/mean_length": 1170.84375,
+      "completions/mean_terminated_length": 1096.5084228515625,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "epoch": 0.22742857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28278952836990356,
+      "learning_rate": 1.002741278414069e-07,
+      "loss": 0.0,
+      "num_tokens": 18915472.0,
+      "reward": 0.6831471920013428,
+      "reward_std": 0.6951984167098999,
+      "rewards/cosine_scaled_reward/mean": -0.1506139189004898,
+      "rewards/cosine_scaled_reward/std": 0.34608688950538635,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1764.0,
+      "completions/mean_length": 999.390625,
+      "completions/mean_terminated_length": 849.5892944335938,
+      "completions/min_length": 395.0,
+      "completions/min_terminated_length": 395.0,
+      "epoch": 0.22857142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28817513585090637,
+      "learning_rate": 1.0006853717962393e-07,
+      "loss": 0.0,
+      "num_tokens": 18989553.0,
+      "reward": 0.9030377864837646,
+      "reward_std": 0.8171917200088501,
+      "rewards/cosine_scaled_reward/mean": -0.01723114401102066,
+      "rewards/cosine_scaled_reward/std": 0.4829805791378021,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 200
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 200,
+  "num_input_tokens_seen": 18989553,
+  "num_train_epochs": 1,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin
new file mode 100644
index 0000000..9e03ee7
--- /dev/null
+++ b/checkpoint-200/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec4fbe7e29ae78abab9c9fd5555870c1bffe7656ceef4ac5fa2148a15b61b1e3
+size 8888
diff --git a/checkpoint-200/zero_to_fp32.py b/checkpoint-200/zero_to_fp32.py
new file mode 100644
index 0000000..0e75914
--- /dev/null
+++ b/checkpoint-200/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild every trainable parameter from the per-rank flat fp32 groups (ZeRO stage <= 2 path)
+    and store it in ``state_dict`` under its parameter name.
+
+    Args:
+        - ``state_dict``: mapping that is updated in place
+        - ``world_size``: number of ranks; used for the debug dump and for the alignment check
+        - ``fp32_flat_groups``: indexed as ``fp32_flat_groups[rank][group]``, each entry a flat tensor
+        - ``zero_model_states``: per-rank model states; only rank 0's ``param_shapes`` is read
+
+    Raises:
+        - ``ValueError``: if, after alignment, the number of elements consumed from a group differs
+          from the number of elements available in it
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # 1. for each param group, concatenate that group's flat tensor from every rank
+    # 2. walk the group's ``name -> shape`` entries in order, taking the next ``numel`` elements
+    #    off the concatenated vector and viewing them with the recorded shape
+    # 3. check that what was consumed matches what was available, up to alignment padding
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # group i as held by each rank, in rank order
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset restarts at 0 for every group; avail_numel is rebound to this group's size
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # shape may be an object without a numel() method (e.g. a plain sequence of ints)
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # narrow() + view() return views, so the parameter shares storage with the merged vector
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    The weights are written into ``output_dir`` as ``pytorch_model*.bin`` (or ``model*.safetensors``
+    when ``safe_serialization`` is set). When the result is split into several shards an
+    ``*.index.json`` file mapping each weight to its shard file is written next to them.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB.
+          Pass ``None`` to write everything into one file without sharding
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        - ``ImportError``: ``safetensors`` / ``huggingface_hub`` is required by the chosen options but not installed
+    """
+
+    # Dependency pre-check: fail before the (expensive) conversion starts
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    # lazy_mode=True: the state dict is returned without being passed through to_torch_tensor()
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: the split is planned on empty placeholder
+        # tensors that only carry shape and dtype
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # no sharding requested: build a minimal stand-in exposing the two attributes used below
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        # convert only the entries that belong to this shard
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-50/config.json b/checkpoint-50/config.json
new file mode 100644
index 0000000..78fed5b
--- /dev/null
+++ b/checkpoint-50/config.json
@@ -0,0 +1,29 @@
+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": false,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
diff --git a/checkpoint-50/generation_config.json b/checkpoint-50/generation_config.json
new file mode 100644
index 0000000..92878bd
--- /dev/null
+++ b/checkpoint-50/generation_config.json
@@ -0,0 +1,9 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 151646,
+  "do_sample": true,
+  "eos_token_id": 151643,
+  "temperature": 0.6,
+  "top_p": 0.95,
+  "transformers_version": "4.51.3"
+}
diff --git a/checkpoint-50/global_step50/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-50/global_step50/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..49a88ee
--- /dev/null
+++ b/checkpoint-50/global_step50/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15b6a9782b696b9648759e23b9a015a229cf06021f44496888b5b974982b73d0
+size 5331274140
diff --git a/checkpoint-50/global_step50/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-50/global_step50/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..28ad062
--- /dev/null
+++ b/checkpoint-50/global_step50/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f8ce47abef1a09396a2597f93a6dab8419eb47fb54f07cef9745062ef5e6150
+size 5331276572
diff --git a/checkpoint-50/global_step50/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-50/global_step50/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..a564453
--- /dev/null
+++ b/checkpoint-50/global_step50/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:999e75772985222df760220d28f2111b606d3fb2b1158595ae4c2a3dd6426a9b
+size 5331276892
diff --git a/checkpoint-50/global_step50/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-50/global_step50/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000..245973f
--- /dev/null
+++ b/checkpoint-50/global_step50/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a709181aaa3478b82261661698e5e2ae9ccd8b5a5eaa4d6872be8b7346ba0fa5
+size 5331273884
diff --git a/checkpoint-50/global_step50/mp_rank_00_model_states.pt b/checkpoint-50/global_step50/mp_rank_00_model_states.pt
new file mode 100644
index 0000000..0f58570
--- /dev/null
+++ b/checkpoint-50/global_step50/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36c107850937f54f3ab44e97309ceb540caccbb4f6dd4fb455172a625fea33b7
+size 3554267640
diff --git a/checkpoint-50/latest b/checkpoint-50/latest
new file mode 100644
index 0000000..9b4dc80
--- /dev/null
+++ b/checkpoint-50/latest
@@ -0,0 +1 @@
+global_step50
\ No newline at end of file
diff --git a/checkpoint-50/model.safetensors b/checkpoint-50/model.safetensors
new file mode 100644
index 0000000..ee7ca9a
--- /dev/null
+++ b/checkpoint-50/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:afa546fe36442ef1eadd1abe75c4e1cc7f12d3eb73207863f1c8275e5e09d219
+size 3554214752
diff --git a/checkpoint-50/rng_state_0.pth b/checkpoint-50/rng_state_0.pth
new file mode 100644
index 0000000..53282b7
--- /dev/null
+++ b/checkpoint-50/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c144d7042faa07cd1b3e09bc6db4e37259092c2146f9f694ec2741bf55635f63
+size 14960
diff --git a/checkpoint-50/rng_state_1.pth b/checkpoint-50/rng_state_1.pth
new file mode 100644
index 0000000..5ec9370
--- /dev/null
+++ b/checkpoint-50/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f8ad4f83876ffc17dd3af676c2318f13a70ea40dda9ffa3802ec5bc5ad03eb3
+size 14960
diff --git a/checkpoint-50/rng_state_2.pth b/checkpoint-50/rng_state_2.pth
new file mode 100644
index 0000000..77cdca5
--- /dev/null
+++ b/checkpoint-50/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9b346a73b0296c1aab4bbcd1a7c35830287f75f12d6c38a95f96663bc882c00
+size 14960
diff --git a/checkpoint-50/rng_state_3.pth b/checkpoint-50/rng_state_3.pth
new file mode 100644
index 0000000..3ea893c
--- /dev/null
+++ b/checkpoint-50/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0276cfb938f62268a0976c5009fbeeac22aa3317b639d8877e741f6279007a62
+size 14960
diff --git a/checkpoint-50/scheduler.pt b/checkpoint-50/scheduler.pt
new file mode 100644
index 0000000..be8f09d
--- /dev/null
+++ b/checkpoint-50/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0208179da2b605778b21720a99ccfb3d5e515115ee90824c90bfcabf8ad99120
+size 1064
diff --git a/checkpoint-50/special_tokens_map.json b/checkpoint-50/special_tokens_map.json
new file mode 100644
index 0000000..1d385d6
--- /dev/null
+++ b/checkpoint-50/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/checkpoint-50/tokenizer.json b/checkpoint-50/tokenizer.json
new file mode 100644
index 0000000..e7cd2c1
--- /dev/null
+++ b/checkpoint-50/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a
+size 11422959
diff --git a/checkpoint-50/tokenizer_config.json b/checkpoint-50/tokenizer_config.json
new file mode 100644
index 0000000..ef6e98c
--- /dev/null
+++ b/checkpoint-50/tokenizer_config.json
@@ -0,0 +1,195 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool 
%}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\\n'}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/checkpoint-50/trainer_state.json b/checkpoint-50/trainer_state.json
new file mode 100644
index 0000000..6678a05
--- /dev/null
+++ b/checkpoint-50/trainer_state.json
@@ -0,0 +1,1384 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.05714285714285714,
+  "eval_steps": 500,
+  "global_step": 50,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1734.0,
+      "completions/mean_length": 1702.03125,
+      "completions/mean_terminated_length": 993.6190795898438,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "epoch": 0.001142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2544386684894562,
+      "learning_rate": 0.0,
+      "loss": -0.0,
+      "num_tokens": 118418.0,
+      "reward": 0.17899775505065918,
+      "reward_std": 0.7650213241577148,
+      "rewards/cosine_scaled_reward/mean": -0.09800112992525101,
+      "rewards/cosine_scaled_reward/std": 0.37953105568885803,
+      "rewards/format_reward/mean": 0.375,
+      "rewards/format_reward/std": 0.48795005679130554,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1894.0,
+      "completions/mean_length": 1738.90625,
+      "completions/mean_terminated_length": 949.0,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "epoch": 0.002285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2436082512140274,
+      "learning_rate": 5e-08,
+      "loss": -0.0,
+      "num_tokens": 239748.0,
+      "reward": 0.3848632574081421,
+      "reward_std": 0.9111153483390808,
+      "rewards/cosine_scaled_reward/mean": 0.020556632429361343,
+      "rewards/cosine_scaled_reward/std": 0.4492928683757782,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1545.0,
+      "completions/mean_length": 1989.015625,
+      "completions/mean_terminated_length": 1104.25,
+      "completions/min_length": 706.0,
+      "completions/min_terminated_length": 706.0,
+      "epoch": 0.0034285714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2544717788696289,
+      "learning_rate": 1e-07,
+      "loss": -0.0,
+      "num_tokens": 377517.0,
+      "reward": -0.3279358148574829,
+      "reward_std": 0.33216947317123413,
+      "rewards/cosine_scaled_reward/mean": -0.20303040742874146,
+      "rewards/cosine_scaled_reward/std": 0.179075226187706,
+      "rewards/format_reward/mean": 0.078125,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2033.0,
+      "completions/mean_length": 1566.421875,
+      "completions/mean_terminated_length": 1084.84375,
+      "completions/min_length": 502.0,
+      "completions/min_terminated_length": 502.0,
+      "epoch": 0.004571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28807103633880615,
+      "learning_rate": 1.5e-07,
+      "loss": -0.0,
+      "num_tokens": 487576.0,
+      "reward": 0.2716121971607208,
+      "reward_std": 0.6643469333648682,
+      "rewards/cosine_scaled_reward/mean": -0.12981891632080078,
+      "rewards/cosine_scaled_reward/std": 0.3019586503505707,
+      "rewards/format_reward/mean": 0.53125,
+      "rewards/format_reward/std": 0.5029674172401428,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1807.0,
+      "completions/mean_length": 1936.84375,
+      "completions/mean_terminated_length": 1031.71435546875,
+      "completions/min_length": 463.0,
+      "completions/min_terminated_length": 463.0,
+      "epoch": 0.005714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26783761382102966,
+      "learning_rate": 2e-07,
+      "loss": -0.0,
+      "num_tokens": 622350.0,
+      "reward": -0.3612896800041199,
+      "reward_std": 0.41048353910446167,
+      "rewards/cosine_scaled_reward/mean": -0.23533234000205994,
+      "rewards/cosine_scaled_reward/std": 0.20467400550842285,
+      "rewards/format_reward/mean": 0.109375,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1301.0,
+      "completions/mean_length": 1889.453125,
+      "completions/mean_terminated_length": 779.625,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "epoch": 0.006857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.262518972158432,
+      "learning_rate": 2.5e-07,
+      "loss": 0.0,
+      "num_tokens": 754923.0,
+      "reward": -0.29250282049179077,
+      "reward_std": 0.5422531962394714,
+      "rewards/cosine_scaled_reward/mean": -0.22437641024589539,
+      "rewards/cosine_scaled_reward/std": 0.22509199380874634,
+      "rewards/format_reward/mean": 0.15625,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1568.0,
+      "completions/mean_length": 1921.921875,
+      "completions/mean_terminated_length": 1314.45458984375,
+      "completions/min_length": 927.0,
+      "completions/min_terminated_length": 927.0,
+      "epoch": 0.008,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22601397335529327,
+      "learning_rate": 3e-07,
+      "loss": 0.0,
+      "num_tokens": 888334.0,
+      "reward": 0.025340259075164795,
+      "reward_std": 0.7285393476486206,
+      "rewards/cosine_scaled_reward/mean": -0.1279548704624176,
+      "rewards/cosine_scaled_reward/std": 0.40222346782684326,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2009.0,
+      "completions/mean_length": 1736.859375,
+      "completions/mean_terminated_length": 999.9473876953125,
+      "completions/min_length": 305.0,
+      "completions/min_terminated_length": 305.0,
+      "epoch": 0.009142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24552854895591736,
+      "learning_rate": 3.5e-07,
+      "loss": 0.0,
+      "num_tokens": 1009909.0,
+      "reward": 0.21729671955108643,
+      "reward_std": 0.6989120244979858,
+      "rewards/cosine_scaled_reward/mean": -0.055414143949747086,
+      "rewards/cosine_scaled_reward/std": 0.47493892908096313,
+      "rewards/format_reward/mean": 0.328125,
+      "rewards/format_reward/std": 0.4732423722743988,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1963.0,
+      "completions/mean_length": 1967.53125,
+      "completions/mean_terminated_length": 1475.77783203125,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "epoch": 0.010285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2430322915315628,
+      "learning_rate": 4e-07,
+      "loss": 0.0,
+      "num_tokens": 1147287.0,
+      "reward": -0.21451422572135925,
+      "reward_std": 0.587526798248291,
+      "rewards/cosine_scaled_reward/mean": -0.19319462776184082,
+      "rewards/cosine_scaled_reward/std": 0.29357606172561646,
+      "rewards/format_reward/mean": 0.171875,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1966.0,
+      "completions/mean_length": 1708.546875,
+      "completions/mean_terminated_length": 961.75,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "epoch": 0.011428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2543582320213318,
+      "learning_rate": 4.5e-07,
+      "loss": 0.0,
+      "num_tokens": 1267466.0,
+      "reward": 0.02539752423763275,
+      "reward_std": 0.545810341835022,
+      "rewards/cosine_scaled_reward/mean": -0.14355123043060303,
+      "rewards/cosine_scaled_reward/std": 0.36147356033325195,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.90625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1579.0,
+      "completions/mean_length": 1967.734375,
+      "completions/mean_terminated_length": 1191.8333740234375,
+      "completions/min_length": 843.0,
+      "completions/min_terminated_length": 843.0,
+      "epoch": 0.012571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24583907425403595,
+      "learning_rate": 5e-07,
+      "loss": -0.0,
+      "num_tokens": 1405073.0,
+      "reward": -0.46971434354782104,
+      "reward_std": 0.36104393005371094,
+      "rewards/cosine_scaled_reward/mean": -0.28173214197158813,
+      "rewards/cosine_scaled_reward/std": 0.17775526642799377,
+      "rewards/format_reward/mean": 0.09375,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1862.0,
+      "completions/mean_length": 1707.5625,
+      "completions/mean_terminated_length": 1176.47998046875,
+      "completions/min_length": 330.0,
+      "completions/min_terminated_length": 330.0,
+      "epoch": 0.013714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3135142922401428,
+      "learning_rate": 5.5e-07,
+      "loss": -0.0,
+      "num_tokens": 1525301.0,
+      "reward": 0.0018395520746707916,
+      "reward_std": 0.7012988328933716,
+      "rewards/cosine_scaled_reward/mean": -0.21783021092414856,
+      "rewards/cosine_scaled_reward/std": 0.324150949716568,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1745.0,
+      "completions/mean_length": 1841.96875,
+      "completions/mean_terminated_length": 1168.933349609375,
+      "completions/min_length": 442.0,
+      "completions/min_terminated_length": 442.0,
+      "epoch": 0.014857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2532394826412201,
+      "learning_rate": 6e-07,
+      "loss": -0.0,
+      "num_tokens": 1654227.0,
+      "reward": -0.10322706401348114,
+      "reward_std": 0.6915165185928345,
+      "rewards/cosine_scaled_reward/mean": -0.17661353945732117,
+      "rewards/cosine_scaled_reward/std": 0.329875111579895,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2017.0,
+      "completions/mean_length": 1816.390625,
+      "completions/mean_terminated_length": 1306.8499755859375,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "epoch": 0.016,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28405147790908813,
+      "learning_rate": 6.5e-07,
+      "loss": 0.0,
+      "num_tokens": 1781084.0,
+      "reward": 0.10602855682373047,
+      "reward_std": 0.630502462387085,
+      "rewards/cosine_scaled_reward/mean": -0.11104822158813477,
+      "rewards/cosine_scaled_reward/std": 0.3846627473831177,
+      "rewards/format_reward/mean": 0.328125,
+      "rewards/format_reward/std": 0.4732423722743988,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1574.0,
+      "completions/mean_length": 1702.109375,
+      "completions/mean_terminated_length": 818.1666870117188,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "epoch": 0.017142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28779250383377075,
+      "learning_rate": 7e-07,
+      "loss": 0.0,
+      "num_tokens": 1900939.0,
+      "reward": 0.32734519243240356,
+      "reward_std": 0.3870265483856201,
+      "rewards/cosine_scaled_reward/mean": 0.007422588765621185,
+      "rewards/cosine_scaled_reward/std": 0.45787373185157776,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 2048.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 2048.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.018285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2337152510881424,
+      "learning_rate": 7.5e-07,
+      "loss": -0.0,
+      "num_tokens": 2042451.0,
+      "reward": -0.5429925918579102,
+      "reward_std": 0.3153150975704193,
+      "rewards/cosine_scaled_reward/mean": -0.2714962661266327,
+      "rewards/cosine_scaled_reward/std": 0.1678173691034317,
+      "rewards/format_reward/mean": 0.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1879.0,
+      "completions/mean_length": 1564.921875,
+      "completions/mean_terminated_length": 858.8846435546875,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "epoch": 0.019428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33599403500556946,
+      "learning_rate": 8e-07,
+      "loss": -0.0,
+      "num_tokens": 2153126.0,
+      "reward": 0.17696775496006012,
+      "reward_std": 0.6489306688308716,
+      "rewards/cosine_scaled_reward/mean": -0.11464111506938934,
+      "rewards/cosine_scaled_reward/std": 0.3551919758319855,
+      "rewards/format_reward/mean": 0.40625,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1298.0,
+      "completions/mean_length": 1795.390625,
+      "completions/mean_terminated_length": 893.21435546875,
+      "completions/min_length": 619.0,
+      "completions/min_terminated_length": 619.0,
+      "epoch": 0.02057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22697053849697113,
+      "learning_rate": 8.499999999999999e-07,
+      "loss": -0.0,
+      "num_tokens": 2278407.0,
+      "reward": -0.10711958259344101,
+      "reward_std": 0.5238703489303589,
+      "rewards/cosine_scaled_reward/mean": -0.1785597801208496,
+      "rewards/cosine_scaled_reward/std": 0.2545098662376404,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1949.0,
+      "completions/mean_length": 1921.484375,
+      "completions/mean_terminated_length": 1238.300048828125,
+      "completions/min_length": 623.0,
+      "completions/min_terminated_length": 623.0,
+      "epoch": 0.021714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23972108960151672,
+      "learning_rate": 9e-07,
+      "loss": 0.0,
+      "num_tokens": 2412638.0,
+      "reward": 0.029344379901885986,
+      "reward_std": 0.6719281077384949,
+      "rewards/cosine_scaled_reward/mean": -0.086890310049057,
+      "rewards/cosine_scaled_reward/std": 0.40220555663108826,
+      "rewards/format_reward/mean": 0.203125,
+      "rewards/format_reward/std": 0.40550529956817627,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2025.0,
+      "completions/mean_length": 1728.5625,
+      "completions/mean_terminated_length": 845.4117431640625,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "epoch": 0.022857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23309311270713806,
+      "learning_rate": 9.499999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 2534618.0,
+      "reward": 0.0131673663854599,
+      "reward_std": 0.4436222314834595,
+      "rewards/cosine_scaled_reward/mean": -0.13404130935668945,
+      "rewards/cosine_scaled_reward/std": 0.32819250226020813,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1923.0,
+      "completions/mean_length": 1777.953125,
+      "completions/mean_terminated_length": 1087.8333740234375,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "epoch": 0.024,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29990270733833313,
+      "learning_rate": 1e-06,
+      "loss": -0.0,
+      "num_tokens": 2659215.0,
+      "reward": -0.1764472872018814,
+      "reward_std": 0.5121938586235046,
+      "rewards/cosine_scaled_reward/mean": -0.2444736361503601,
+      "rewards/cosine_scaled_reward/std": 0.289971262216568,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.390625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1935.0,
+      "completions/mean_length": 1361.28125,
+      "completions/mean_terminated_length": 921.0769653320312,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 0.025142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29922786355018616,
+      "learning_rate": 9.99931462820376e-07,
+      "loss": -0.0,
+      "num_tokens": 2755353.0,
+      "reward": 0.6089149713516235,
+      "reward_std": 0.5986809730529785,
+      "rewards/cosine_scaled_reward/mean": -0.05491749942302704,
+      "rewards/cosine_scaled_reward/std": 0.39076483249664307,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2043.0,
+      "completions/mean_length": 1565.046875,
+      "completions/mean_terminated_length": 903.2222290039062,
+      "completions/min_length": 405.0,
+      "completions/min_terminated_length": 405.0,
+      "epoch": 0.026285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27512773871421814,
+      "learning_rate": 9.997258721585931e-07,
+      "loss": -0.0,
+      "num_tokens": 2866308.0,
+      "reward": 0.21871733665466309,
+      "reward_std": 0.5976030826568604,
+      "rewards/cosine_scaled_reward/mean": -0.10157884657382965,
+      "rewards/cosine_scaled_reward/std": 0.3856185972690582,
+      "rewards/format_reward/mean": 0.421875,
+      "rewards/format_reward/std": 0.49776285886764526,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1947.0,
+      "completions/mean_length": 1801.671875,
+      "completions/mean_terminated_length": 1259.75,
+      "completions/min_length": 573.0,
+      "completions/min_terminated_length": 573.0,
+      "epoch": 0.027428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22642865777015686,
+      "learning_rate": 9.993832906395582e-07,
+      "loss": -0.0,
+      "num_tokens": 2992543.0,
+      "reward": 0.04899948835372925,
+      "reward_std": 0.8525694608688354,
+      "rewards/cosine_scaled_reward/mean": -0.17081275582313538,
+      "rewards/cosine_scaled_reward/std": 0.3993513882160187,
+      "rewards/format_reward/mean": 0.390625,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1715.765625,
+      "completions/mean_terminated_length": 1035.4761962890625,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "epoch": 0.02857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25316134095191956,
+      "learning_rate": 9.989038226169207e-07,
+      "loss": -0.0,
+      "num_tokens": 3112648.0,
+      "reward": 0.10585837811231613,
+      "reward_std": 0.7828943729400635,
+      "rewards/cosine_scaled_reward/mean": -0.11894579976797104,
+      "rewards/cosine_scaled_reward/std": 0.4141720235347748,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1964.0,
+      "completions/mean_length": 1917.703125,
+      "completions/mean_terminated_length": 1452.357177734375,
+      "completions/min_length": 840.0,
+      "completions/min_terminated_length": 840.0,
+      "epoch": 0.029714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2521306574344635,
+      "learning_rate": 9.982876141412855e-07,
+      "loss": -0.0,
+      "num_tokens": 3246013.0,
+      "reward": 0.17620250582695007,
+      "reward_std": 0.6548349857330322,
+      "rewards/cosine_scaled_reward/mean": -0.08377375453710556,
+      "rewards/cosine_scaled_reward/std": 0.3527655303478241,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1990.0,
+      "completions/mean_length": 1851.015625,
+      "completions/mean_terminated_length": 1147.5,
+      "completions/min_length": 506.0,
+      "completions/min_terminated_length": 506.0,
+      "epoch": 0.030857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2730060815811157,
+      "learning_rate": 9.975348529157229e-07,
+      "loss": -0.0,
+      "num_tokens": 3374766.0,
+      "reward": -0.18854813277721405,
+      "reward_std": 0.49348777532577515,
+      "rewards/cosine_scaled_reward/mean": -0.21146157383918762,
+      "rewards/cosine_scaled_reward/std": 0.2601618766784668,
+      "rewards/format_reward/mean": 0.234375,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2023.0,
+      "completions/mean_length": 1798.328125,
+      "completions/mean_terminated_length": 1049.3125,
+      "completions/min_length": 403.0,
+      "completions/min_terminated_length": 403.0,
+      "epoch": 0.032,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2566036880016327,
+      "learning_rate": 9.96645768238595e-07,
+      "loss": 0.0,
+      "num_tokens": 3500195.0,
+      "reward": 0.06705980002880096,
+      "reward_std": 0.7090284824371338,
+      "rewards/cosine_scaled_reward/mean": -0.10709509253501892,
+      "rewards/cosine_scaled_reward/std": 0.4101051986217499,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1988.0,
+      "completions/mean_length": 1930.203125,
+      "completions/mean_terminated_length": 1210.3333740234375,
+      "completions/min_length": 582.0,
+      "completions/min_terminated_length": 582.0,
+      "epoch": 0.03314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25197461247444153,
+      "learning_rate": 9.956206309337066e-07,
+      "loss": 0.0,
+      "num_tokens": 3634200.0,
+      "reward": -0.2462695688009262,
+      "reward_std": 0.5237302780151367,
+      "rewards/cosine_scaled_reward/mean": -0.2012597918510437,
+      "rewards/cosine_scaled_reward/std": 0.23252712190151215,
+      "rewards/format_reward/mean": 0.15625,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1900.0,
+      "completions/mean_length": 1847.65625,
+      "completions/mean_terminated_length": 1061.6923828125,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "epoch": 0.03428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30431485176086426,
+      "learning_rate": 9.944597532678119e-07,
+      "loss": 0.0,
+      "num_tokens": 3762986.0,
+      "reward": -0.05392302945256233,
+      "reward_std": 0.7249555587768555,
+      "rewards/cosine_scaled_reward/mean": -0.15196150541305542,
+      "rewards/cosine_scaled_reward/std": 0.34566983580589294,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1860.0,
+      "completions/mean_length": 1838.671875,
+      "completions/mean_terminated_length": 931.5833740234375,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "epoch": 0.03542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2484513372182846,
+      "learning_rate": 9.931634888554935e-07,
+      "loss": 0.0,
+      "num_tokens": 3891157.0,
+      "reward": -0.11271396279335022,
+      "reward_std": 0.6705260872840881,
+      "rewards/cosine_scaled_reward/mean": -0.1813569962978363,
+      "rewards/cosine_scaled_reward/std": 0.4071698486804962,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1715.0,
+      "completions/mean_length": 1910.109375,
+      "completions/mean_terminated_length": 1417.6429443359375,
+      "completions/min_length": 906.0,
+      "completions/min_terminated_length": 906.0,
+      "epoch": 0.036571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25329527258872986,
+      "learning_rate": 9.917322325514487e-07,
+      "loss": -0.0,
+      "num_tokens": 4023756.0,
+      "reward": -0.08931556344032288,
+      "reward_std": 0.6381070613861084,
+      "rewards/cosine_scaled_reward/mean": -0.16965776681900024,
+      "rewards/cosine_scaled_reward/std": 0.37385129928588867,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.953125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1865.0,
+      "completions/mean_length": 2023.71875,
+      "completions/mean_terminated_length": 1530.0,
+      "completions/min_length": 1107.0,
+      "completions/min_terminated_length": 1107.0,
+      "epoch": 0.037714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22758109867572784,
+      "learning_rate": 9.901664203302124e-07,
+      "loss": 0.0,
+      "num_tokens": 4164490.0,
+      "reward": -0.4589868187904358,
+      "reward_std": 0.5177067518234253,
+      "rewards/cosine_scaled_reward/mean": -0.2919934093952179,
+      "rewards/cosine_scaled_reward/std": 0.2252870500087738,
+      "rewards/format_reward/mean": 0.125,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.453125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1999.0,
+      "completions/mean_length": 1454.78125,
+      "completions/mean_terminated_length": 963.2571411132812,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 0.038857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3234354257583618,
+      "learning_rate": 9.88466529153356e-07,
+      "loss": 0.0,
+      "num_tokens": 4267148.0,
+      "reward": 0.656031608581543,
+      "reward_std": 0.7529654502868652,
+      "rewards/cosine_scaled_reward/mean": 0.05457830801606178,
+      "rewards/cosine_scaled_reward/std": 0.49684229493141174,
+      "rewards/format_reward/mean": 0.546875,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1724.0,
+      "completions/mean_length": 1819.078125,
+      "completions/mean_terminated_length": 716.0909423828125,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "epoch": 0.04,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2821458876132965,
+      "learning_rate": 9.866330768241983e-07,
+      "loss": -0.0,
+      "num_tokens": 4395065.0,
+      "reward": -0.09630556404590607,
+      "reward_std": 0.7089139223098755,
+      "rewards/cosine_scaled_reward/mean": -0.15752778947353363,
+      "rewards/cosine_scaled_reward/std": 0.3647947609424591,
+      "rewards/format_reward/mean": 0.21875,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1811.0,
+      "completions/mean_length": 1954.34375,
+      "completions/mean_terminated_length": 1382.0,
+      "completions/min_length": 949.0,
+      "completions/min_terminated_length": 949.0,
+      "epoch": 0.04114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24163897335529327,
+      "learning_rate": 9.846666218300807e-07,
+      "loss": -0.0,
+      "num_tokens": 4531255.0,
+      "reward": -0.34593287110328674,
+      "reward_std": 0.44493502378463745,
+      "rewards/cosine_scaled_reward/mean": -0.24327893555164337,
+      "rewards/cosine_scaled_reward/std": 0.24784433841705322,
+      "rewards/format_reward/mean": 0.140625,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1723.0,
+      "completions/mean_length": 1868.921875,
+      "completions/mean_terminated_length": 1092.916748046875,
+      "completions/min_length": 620.0,
+      "completions/min_terminated_length": 620.0,
+      "epoch": 0.04228571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24795544147491455,
+      "learning_rate": 9.825677631722435e-07,
+      "loss": -0.0,
+      "num_tokens": 4661890.0,
+      "reward": -0.23053905367851257,
+      "reward_std": 0.34036368131637573,
+      "rewards/cosine_scaled_reward/mean": -0.2246445268392563,
+      "rewards/cosine_scaled_reward/std": 0.15942412614822388,
+      "rewards/format_reward/mean": 0.21875,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1397.0,
+      "completions/mean_length": 1889.53125,
+      "completions/mean_terminated_length": 1033.800048828125,
+      "completions/min_length": 810.0,
+      "completions/min_terminated_length": 810.0,
+      "epoch": 0.04342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24283826351165771,
+      "learning_rate": 9.80337140183366e-07,
+      "loss": 0.0,
+      "num_tokens": 4794532.0,
+      "reward": -0.10043507814407349,
+      "reward_std": 0.47925832867622375,
+      "rewards/cosine_scaled_reward/mean": -0.13615503907203674,
+      "rewards/cosine_scaled_reward/std": 0.3336707651615143,
+      "rewards/format_reward/mean": 0.171875,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1515.0,
+      "completions/mean_length": 1644.828125,
+      "completions/mean_terminated_length": 689.9473876953125,
+      "completions/min_length": 279.0,
+      "completions/min_terminated_length": 279.0,
+      "epoch": 0.044571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28362998366355896,
+      "learning_rate": 9.779754323328192e-07,
+      "loss": 0.0,
+      "num_tokens": 4910585.0,
+      "reward": 0.12284853309392929,
+      "reward_std": 0.4183085858821869,
+      "rewards/cosine_scaled_reward/mean": -0.11045074462890625,
+      "rewards/cosine_scaled_reward/std": 0.30217844247817993,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1632.0,
+      "completions/mean_length": 1618.28125,
+      "completions/mean_terminated_length": 902.0833740234375,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "epoch": 0.045714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.262617826461792,
+      "learning_rate": 9.754833590196926e-07,
+      "loss": 0.0,
+      "num_tokens": 5024227.0,
+      "reward": 0.2076582908630371,
+      "reward_std": 0.42125773429870605,
+      "rewards/cosine_scaled_reward/mean": -0.12273336946964264,
+      "rewards/cosine_scaled_reward/std": 0.4404613971710205,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1914.0,
+      "completions/mean_length": 1717.734375,
+      "completions/mean_terminated_length": 1235.0384521484375,
+      "completions/min_length": 664.0,
+      "completions/min_terminated_length": 664.0,
+      "epoch": 0.046857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23294499516487122,
+      "learning_rate": 9.728616793536587e-07,
+      "loss": -0.0,
+      "num_tokens": 5145314.0,
+      "reward": 0.011502981185913086,
+      "reward_std": 0.6816084980964661,
+      "rewards/cosine_scaled_reward/mean": -0.22081100940704346,
+      "rewards/cosine_scaled_reward/std": 0.37589573860168457,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1672.0,
+      "completions/mean_length": 1703.921875,
+      "completions/mean_terminated_length": 579.933349609375,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 0.048,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34672290086746216,
+      "learning_rate": 9.701111919237408e-07,
+      "loss": -0.0,
+      "num_tokens": 5264725.0,
+      "reward": -0.2616002857685089,
+      "reward_std": 0.37952175736427307,
+      "rewards/cosine_scaled_reward/mean": -0.26361262798309326,
+      "rewards/cosine_scaled_reward/std": 0.17531204223632812,
+      "rewards/format_reward/mean": 0.265625,
+      "rewards/format_reward/std": 0.44515693187713623,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1370.0,
+      "completions/mean_length": 1681.84375,
+      "completions/mean_terminated_length": 814.631591796875,
+      "completions/min_length": 308.0,
+      "completions/min_terminated_length": 308.0,
+      "epoch": 0.04914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.263967901468277,
+      "learning_rate": 9.672327345550543e-07,
+      "loss": -0.0,
+      "num_tokens": 5383979.0,
+      "reward": 0.13376155495643616,
+      "reward_std": 0.46012288331985474,
+      "rewards/cosine_scaled_reward/mean": -0.08155670762062073,
+      "rewards/cosine_scaled_reward/std": 0.3612325191497803,
+      "rewards/format_reward/mean": 0.296875,
+      "rewards/format_reward/std": 0.4604927599430084,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1830.0,
+      "completions/mean_length": 1624.625,
+      "completions/mean_terminated_length": 869.9130859375,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "epoch": 0.05028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28927963972091675,
+      "learning_rate": 9.64227184053598e-07,
+      "loss": -0.0,
+      "num_tokens": 5498651.0,
+      "reward": 0.20869271457195282,
+      "reward_std": 0.5558150410652161,
+      "rewards/cosine_scaled_reward/mean": -0.0987786278128624,
+      "rewards/cosine_scaled_reward/std": 0.42912590503692627,
+      "rewards/format_reward/mean": 0.40625,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.921875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1851.0,
+      "completions/mean_length": 2006.96875,
+      "completions/mean_terminated_length": 1522.800048828125,
+      "completions/min_length": 955.0,
+      "completions/min_terminated_length": 955.0,
+      "epoch": 0.05142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24254000186920166,
+      "learning_rate": 9.610954559391704e-07,
+      "loss": 0.0,
+      "num_tokens": 5638753.0,
+      "reward": -0.2540697157382965,
+      "reward_std": 0.4600578844547272,
+      "rewards/cosine_scaled_reward/mean": -0.20515984296798706,
+      "rewards/cosine_scaled_reward/std": 0.3251590430736542,
+      "rewards/format_reward/mean": 0.15625,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1563.0,
+      "completions/mean_length": 1765.984375,
+      "completions/mean_terminated_length": 919.9375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "epoch": 0.052571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2645930349826813,
+      "learning_rate": 9.578385041664925e-07,
+      "loss": 0.0,
+      "num_tokens": 5762944.0,
+      "reward": -0.213707834482193,
+      "reward_std": 0.38778313994407654,
+      "rewards/cosine_scaled_reward/mean": -0.2318539321422577,
+      "rewards/cosine_scaled_reward/std": 0.21436986327171326,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1993.0,
+      "completions/mean_length": 1583.40625,
+      "completions/mean_terminated_length": 986.0714721679688,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "epoch": 0.053714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.311797559261322,
+      "learning_rate": 9.54457320834625e-07,
+      "loss": 0.0,
+      "num_tokens": 5874682.0,
+      "reward": 0.27925533056259155,
+      "reward_std": 0.6467443704605103,
+      "rewards/cosine_scaled_reward/mean": -0.07912233471870422,
+      "rewards/cosine_scaled_reward/std": 0.4737093150615692,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1527.0,
+      "completions/mean_length": 1690.0625,
+      "completions/mean_terminated_length": 1006.727294921875,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "epoch": 0.054857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26644304394721985,
+      "learning_rate": 9.509529358847654e-07,
+      "loss": -0.0,
+      "num_tokens": 5993390.0,
+      "reward": 0.13692031800746918,
+      "reward_std": 0.5655145049095154,
+      "rewards/cosine_scaled_reward/mean": -0.12685233354568481,
+      "rewards/cosine_scaled_reward/std": 0.32320985198020935,
+      "rewards/format_reward/mean": 0.390625,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2021.0,
+      "completions/mean_length": 1387.140625,
+      "completions/mean_terminated_length": 804.0294189453125,
+      "completions/min_length": 300.0,
+      "completions/min_terminated_length": 300.0,
+      "epoch": 0.056,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3078882396221161,
+      "learning_rate": 9.473264167865171e-07,
+      "loss": 0.0,
+      "num_tokens": 6092231.0,
+      "reward": 0.35559189319610596,
+      "reward_std": 0.5927403569221497,
+      "rewards/cosine_scaled_reward/mean": -0.09564155340194702,
+      "rewards/cosine_scaled_reward/std": 0.4046906530857086,
+      "rewards/format_reward/mean": 0.546875,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1830.0,
+      "completions/mean_length": 1674.890625,
+      "completions/mean_terminated_length": 962.5909423828125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "epoch": 0.05714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23925544321537018,
+      "learning_rate": 9.43578868212728e-07,
+      "loss": -0.0,
+      "num_tokens": 6210240.0,
+      "reward": 0.18573230504989624,
+      "reward_std": 0.5264967083930969,
+      "rewards/cosine_scaled_reward/mean": -0.09463384002447128,
+      "rewards/cosine_scaled_reward/std": 0.4100942015647888,
+      "rewards/format_reward/mean": 0.375,
+      "rewards/format_reward/std": 0.48795005679130554,
+      "step": 50
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 200,
+  "num_input_tokens_seen": 6210240,
+  "num_train_epochs": 1,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-50/training_args.bin b/checkpoint-50/training_args.bin
new file mode 100644
index 0000000..9e03ee7
--- /dev/null
+++ b/checkpoint-50/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec4fbe7e29ae78abab9c9fd5555870c1bffe7656ceef4ac5fa2148a15b61b1e3
+size 8888
diff --git a/checkpoint-50/zero_to_fp32.py b/checkpoint-50/zero_to_fp32.py
new file mode 100644
index 0000000..0e75914
--- /dev/null
+++ b/checkpoint-50/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# debug verbosity switch read by the helpers below; the __main__ block overwrites it from --debug
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint and store them in ``state_dict``.
+
+    For every param group the partitions of all ranks are concatenated into one flat vector, which is
+    then sliced into views named and shaped according to the first model state's ``param_shapes``.
+
+    Args:
+        - ``state_dict``: mapping updated in place with ``name -> tensor view`` entries
+        - ``world_size``: number of partitions; used for the alignment check and the debug output
+        - ``fp32_flat_groups``: per-rank lists holding one flat tensor per param group
+        - ``zero_model_states``: parsed model states; only index 0 is read
+
+    Raises:
+        - ``ValueError``: if, after alignment, the consumed element count differs from the group size
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # gather group i from every rank and join the pieces in rank order
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset walks through the merged vector of the current group
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # fall back to math.prod for shapes that don't provide a numel() method
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # narrow() + view() produce a view into the merged vector, not a copy
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild the frozen parameters of a ZeRO stage 3 checkpoint and store them in ``state_dict``.
+
+    The fragments of every model state are concatenated in order, trimmed to the parameter's
+    element count and viewed with the shape recorded by the first model state. Does nothing when
+    the first model state has no frozen parameter shapes.
+
+    Args:
+        - ``state_dict``: mapping updated in place with ``name -> tensor`` entries
+        - ``world_size``: number of partitions; used for the debug output and partition info
+        - ``zero_model_states``: parsed model states, one per rank
+    """
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        # join the fragments of all model states, then drop anything past the real element count
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        # only used by the debug output below
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+
+    The object only records where the weight lives inside the flat groups; the actual tensor is
+    built when ``contiguous()`` is called.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        """
+        Args:
+            - ``flat_groups``: indexed as ``flat_groups[rank][group]``, each entry a flat tensor
+            - ``flat_groups_offset``: group boundaries, entry ``g`` is where group ``g`` starts and
+              entry ``g + 1`` where it ends
+            - ``offset``: start position of this weight, in the same coordinates as ``flat_groups_offset``
+            - ``partitioned_numel``: number of elements taken from each rank
+            - ``shape``: shape of the merged weight
+        """
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        # dtype is taken from the first group of the first rank
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+
+        Returns:
+            - tensor of shape ``self.shape`` built from the chunks of all ranks, in rank order
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            # locate the group that contains the first element and the group that contains the last one
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                # NOTE(review): for any group after start_group_id this difference is negative, which
+                # slicing interprets relative to the end - confirm a weight never spans several groups
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        # drop the elements past the real element count before reshaping
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    The weights are written as one or more shard files into ``output_dir``; when they are sharded an
+    index json file is written as well.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` disables sharding
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        - ``ImportError``: if ``safetensors`` or ``huggingface_hub`` is required by the chosen options but not installed
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # an memory-efficient approach for sharding
+        # (the shard layout is planned on empty tensors that only carry shape and dtype)
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # minimal stand-in exposing the two attributes used below, with everything in one file
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        # materialize only the tensors of the current shard
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..ae35726
--- /dev/null
+++ b/config.json
@@ -0,0 +1,29 @@
+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000..92878bd
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,9 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 151646,
+  "do_sample": true,
+  "eos_token_id": 151643,
+  "temperature": 0.6,
+  "top_p": 0.95,
+  "transformers_version": "4.51.3"
+}
diff --git a/model.safetensors b/model.safetensors
new file mode 100644
index 0000000..24c1613
--- /dev/null
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:721e7cd7a52fbe85031e588ef9dd53b84820dc30295efc7a202ec5bf16e6a44d
+size 3554214752
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000..1d385d6
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000..e7cd2c1
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a
+size 11422959
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000..ef6e98c
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,195 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\\n'}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/train_results.json b/train_results.json
new file mode 100644
index 0000000..2c27fe5
--- /dev/null
+++ b/train_results.json
@@ -0,0 +1,8 @@
+{
+    "total_flos": 0.0,
+    "train_loss": 3.2957177609205244e-09,
+    "train_runtime": 10011.2078,
+    "train_samples": 7000,
+    "train_samples_per_second": 1.279,
+    "train_steps_per_second": 0.02
+}
\ No newline at end of file
diff --git a/trainer_state.json b/trainer_state.json
new file mode 100644
index 0000000..fff1ad1
--- /dev/null
+++ b/trainer_state.json
@@ -0,0 +1,5443 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.22857142857142856,
+  "eval_steps": 500,
+  "global_step": 200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1734.0,
+      "completions/mean_length": 1702.03125,
+      "completions/mean_terminated_length": 993.6190795898438,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "epoch": 0.001142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2544386684894562,
+      "learning_rate": 0.0,
+      "loss": -0.0,
+      "num_tokens": 118418.0,
+      "reward": 0.17899775505065918,
+      "reward_std": 0.7650213241577148,
+      "rewards/cosine_scaled_reward/mean": -0.09800112992525101,
+      "rewards/cosine_scaled_reward/std": 0.37953105568885803,
+      "rewards/format_reward/mean": 0.375,
+      "rewards/format_reward/std": 0.48795005679130554,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1894.0,
+      "completions/mean_length": 1738.90625,
+      "completions/mean_terminated_length": 949.0,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "epoch": 0.002285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2436082512140274,
+      "learning_rate": 5e-08,
+      "loss": -0.0,
+      "num_tokens": 239748.0,
+      "reward": 0.3848632574081421,
+      "reward_std": 0.9111153483390808,
+      "rewards/cosine_scaled_reward/mean": 0.020556632429361343,
+      "rewards/cosine_scaled_reward/std": 0.4492928683757782,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1545.0,
+      "completions/mean_length": 1989.015625,
+      "completions/mean_terminated_length": 1104.25,
+      "completions/min_length": 706.0,
+      "completions/min_terminated_length": 706.0,
+      "epoch": 0.0034285714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2544717788696289,
+      "learning_rate": 1e-07,
+      "loss": -0.0,
+      "num_tokens": 377517.0,
+      "reward": -0.3279358148574829,
+      "reward_std": 0.33216947317123413,
+      "rewards/cosine_scaled_reward/mean": -0.20303040742874146,
+      "rewards/cosine_scaled_reward/std": 0.179075226187706,
+      "rewards/format_reward/mean": 0.078125,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2033.0,
+      "completions/mean_length": 1566.421875,
+      "completions/mean_terminated_length": 1084.84375,
+      "completions/min_length": 502.0,
+      "completions/min_terminated_length": 502.0,
+      "epoch": 0.004571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28807103633880615,
+      "learning_rate": 1.5e-07,
+      "loss": -0.0,
+      "num_tokens": 487576.0,
+      "reward": 0.2716121971607208,
+      "reward_std": 0.6643469333648682,
+      "rewards/cosine_scaled_reward/mean": -0.12981891632080078,
+      "rewards/cosine_scaled_reward/std": 0.3019586503505707,
+      "rewards/format_reward/mean": 0.53125,
+      "rewards/format_reward/std": 0.5029674172401428,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1807.0,
+      "completions/mean_length": 1936.84375,
+      "completions/mean_terminated_length": 1031.71435546875,
+      "completions/min_length": 463.0,
+      "completions/min_terminated_length": 463.0,
+      "epoch": 0.005714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26783761382102966,
+      "learning_rate": 2e-07,
+      "loss": -0.0,
+      "num_tokens": 622350.0,
+      "reward": -0.3612896800041199,
+      "reward_std": 0.41048353910446167,
+      "rewards/cosine_scaled_reward/mean": -0.23533234000205994,
+      "rewards/cosine_scaled_reward/std": 0.20467400550842285,
+      "rewards/format_reward/mean": 0.109375,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1301.0,
+      "completions/mean_length": 1889.453125,
+      "completions/mean_terminated_length": 779.625,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "epoch": 0.006857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.262518972158432,
+      "learning_rate": 2.5e-07,
+      "loss": 0.0,
+      "num_tokens": 754923.0,
+      "reward": -0.29250282049179077,
+      "reward_std": 0.5422531962394714,
+      "rewards/cosine_scaled_reward/mean": -0.22437641024589539,
+      "rewards/cosine_scaled_reward/std": 0.22509199380874634,
+      "rewards/format_reward/mean": 0.15625,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1568.0,
+      "completions/mean_length": 1921.921875,
+      "completions/mean_terminated_length": 1314.45458984375,
+      "completions/min_length": 927.0,
+      "completions/min_terminated_length": 927.0,
+      "epoch": 0.008,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22601397335529327,
+      "learning_rate": 3e-07,
+      "loss": 0.0,
+      "num_tokens": 888334.0,
+      "reward": 0.025340259075164795,
+      "reward_std": 0.7285393476486206,
+      "rewards/cosine_scaled_reward/mean": -0.1279548704624176,
+      "rewards/cosine_scaled_reward/std": 0.40222346782684326,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2009.0,
+      "completions/mean_length": 1736.859375,
+      "completions/mean_terminated_length": 999.9473876953125,
+      "completions/min_length": 305.0,
+      "completions/min_terminated_length": 305.0,
+      "epoch": 0.009142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24552854895591736,
+      "learning_rate": 3.5e-07,
+      "loss": 0.0,
+      "num_tokens": 1009909.0,
+      "reward": 0.21729671955108643,
+      "reward_std": 0.6989120244979858,
+      "rewards/cosine_scaled_reward/mean": -0.055414143949747086,
+      "rewards/cosine_scaled_reward/std": 0.47493892908096313,
+      "rewards/format_reward/mean": 0.328125,
+      "rewards/format_reward/std": 0.4732423722743988,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1963.0,
+      "completions/mean_length": 1967.53125,
+      "completions/mean_terminated_length": 1475.77783203125,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "epoch": 0.010285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2430322915315628,
+      "learning_rate": 4e-07,
+      "loss": 0.0,
+      "num_tokens": 1147287.0,
+      "reward": -0.21451422572135925,
+      "reward_std": 0.587526798248291,
+      "rewards/cosine_scaled_reward/mean": -0.19319462776184082,
+      "rewards/cosine_scaled_reward/std": 0.29357606172561646,
+      "rewards/format_reward/mean": 0.171875,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1966.0,
+      "completions/mean_length": 1708.546875,
+      "completions/mean_terminated_length": 961.75,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "epoch": 0.011428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2543582320213318,
+      "learning_rate": 4.5e-07,
+      "loss": 0.0,
+      "num_tokens": 1267466.0,
+      "reward": 0.02539752423763275,
+      "reward_std": 0.545810341835022,
+      "rewards/cosine_scaled_reward/mean": -0.14355123043060303,
+      "rewards/cosine_scaled_reward/std": 0.36147356033325195,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.90625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1579.0,
+      "completions/mean_length": 1967.734375,
+      "completions/mean_terminated_length": 1191.8333740234375,
+      "completions/min_length": 843.0,
+      "completions/min_terminated_length": 843.0,
+      "epoch": 0.012571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24583907425403595,
+      "learning_rate": 5e-07,
+      "loss": -0.0,
+      "num_tokens": 1405073.0,
+      "reward": -0.46971434354782104,
+      "reward_std": 0.36104393005371094,
+      "rewards/cosine_scaled_reward/mean": -0.28173214197158813,
+      "rewards/cosine_scaled_reward/std": 0.17775526642799377,
+      "rewards/format_reward/mean": 0.09375,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1862.0,
+      "completions/mean_length": 1707.5625,
+      "completions/mean_terminated_length": 1176.47998046875,
+      "completions/min_length": 330.0,
+      "completions/min_terminated_length": 330.0,
+      "epoch": 0.013714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3135142922401428,
+      "learning_rate": 5.5e-07,
+      "loss": -0.0,
+      "num_tokens": 1525301.0,
+      "reward": 0.0018395520746707916,
+      "reward_std": 0.7012988328933716,
+      "rewards/cosine_scaled_reward/mean": -0.21783021092414856,
+      "rewards/cosine_scaled_reward/std": 0.324150949716568,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1745.0,
+      "completions/mean_length": 1841.96875,
+      "completions/mean_terminated_length": 1168.933349609375,
+      "completions/min_length": 442.0,
+      "completions/min_terminated_length": 442.0,
+      "epoch": 0.014857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2532394826412201,
+      "learning_rate": 6e-07,
+      "loss": -0.0,
+      "num_tokens": 1654227.0,
+      "reward": -0.10322706401348114,
+      "reward_std": 0.6915165185928345,
+      "rewards/cosine_scaled_reward/mean": -0.17661353945732117,
+      "rewards/cosine_scaled_reward/std": 0.329875111579895,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2017.0,
+      "completions/mean_length": 1816.390625,
+      "completions/mean_terminated_length": 1306.8499755859375,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "epoch": 0.016,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28405147790908813,
+      "learning_rate": 6.5e-07,
+      "loss": 0.0,
+      "num_tokens": 1781084.0,
+      "reward": 0.10602855682373047,
+      "reward_std": 0.630502462387085,
+      "rewards/cosine_scaled_reward/mean": -0.11104822158813477,
+      "rewards/cosine_scaled_reward/std": 0.3846627473831177,
+      "rewards/format_reward/mean": 0.328125,
+      "rewards/format_reward/std": 0.4732423722743988,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1574.0,
+      "completions/mean_length": 1702.109375,
+      "completions/mean_terminated_length": 818.1666870117188,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "epoch": 0.017142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28779250383377075,
+      "learning_rate": 7e-07,
+      "loss": 0.0,
+      "num_tokens": 1900939.0,
+      "reward": 0.32734519243240356,
+      "reward_std": 0.3870265483856201,
+      "rewards/cosine_scaled_reward/mean": 0.007422588765621185,
+      "rewards/cosine_scaled_reward/std": 0.45787373185157776,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 2048.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 2048.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.018285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2337152510881424,
+      "learning_rate": 7.5e-07,
+      "loss": -0.0,
+      "num_tokens": 2042451.0,
+      "reward": -0.5429925918579102,
+      "reward_std": 0.3153150975704193,
+      "rewards/cosine_scaled_reward/mean": -0.2714962661266327,
+      "rewards/cosine_scaled_reward/std": 0.1678173691034317,
+      "rewards/format_reward/mean": 0.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1879.0,
+      "completions/mean_length": 1564.921875,
+      "completions/mean_terminated_length": 858.8846435546875,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "epoch": 0.019428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33599403500556946,
+      "learning_rate": 8e-07,
+      "loss": -0.0,
+      "num_tokens": 2153126.0,
+      "reward": 0.17696775496006012,
+      "reward_std": 0.6489306688308716,
+      "rewards/cosine_scaled_reward/mean": -0.11464111506938934,
+      "rewards/cosine_scaled_reward/std": 0.3551919758319855,
+      "rewards/format_reward/mean": 0.40625,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1298.0,
+      "completions/mean_length": 1795.390625,
+      "completions/mean_terminated_length": 893.21435546875,
+      "completions/min_length": 619.0,
+      "completions/min_terminated_length": 619.0,
+      "epoch": 0.02057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22697053849697113,
+      "learning_rate": 8.499999999999999e-07,
+      "loss": -0.0,
+      "num_tokens": 2278407.0,
+      "reward": -0.10711958259344101,
+      "reward_std": 0.5238703489303589,
+      "rewards/cosine_scaled_reward/mean": -0.1785597801208496,
+      "rewards/cosine_scaled_reward/std": 0.2545098662376404,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1949.0,
+      "completions/mean_length": 1921.484375,
+      "completions/mean_terminated_length": 1238.300048828125,
+      "completions/min_length": 623.0,
+      "completions/min_terminated_length": 623.0,
+      "epoch": 0.021714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23972108960151672,
+      "learning_rate": 9e-07,
+      "loss": 0.0,
+      "num_tokens": 2412638.0,
+      "reward": 0.029344379901885986,
+      "reward_std": 0.6719281077384949,
+      "rewards/cosine_scaled_reward/mean": -0.086890310049057,
+      "rewards/cosine_scaled_reward/std": 0.40220555663108826,
+      "rewards/format_reward/mean": 0.203125,
+      "rewards/format_reward/std": 0.40550529956817627,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2025.0,
+      "completions/mean_length": 1728.5625,
+      "completions/mean_terminated_length": 845.4117431640625,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "epoch": 0.022857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23309311270713806,
+      "learning_rate": 9.499999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 2534618.0,
+      "reward": 0.0131673663854599,
+      "reward_std": 0.4436222314834595,
+      "rewards/cosine_scaled_reward/mean": -0.13404130935668945,
+      "rewards/cosine_scaled_reward/std": 0.32819250226020813,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1923.0,
+      "completions/mean_length": 1777.953125,
+      "completions/mean_terminated_length": 1087.8333740234375,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "epoch": 0.024,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29990270733833313,
+      "learning_rate": 1e-06,
+      "loss": -0.0,
+      "num_tokens": 2659215.0,
+      "reward": -0.1764472872018814,
+      "reward_std": 0.5121938586235046,
+      "rewards/cosine_scaled_reward/mean": -0.2444736361503601,
+      "rewards/cosine_scaled_reward/std": 0.289971262216568,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.390625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1935.0,
+      "completions/mean_length": 1361.28125,
+      "completions/mean_terminated_length": 921.0769653320312,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 0.025142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29922786355018616,
+      "learning_rate": 9.99931462820376e-07,
+      "loss": -0.0,
+      "num_tokens": 2755353.0,
+      "reward": 0.6089149713516235,
+      "reward_std": 0.5986809730529785,
+      "rewards/cosine_scaled_reward/mean": -0.05491749942302704,
+      "rewards/cosine_scaled_reward/std": 0.39076483249664307,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2043.0,
+      "completions/mean_length": 1565.046875,
+      "completions/mean_terminated_length": 903.2222290039062,
+      "completions/min_length": 405.0,
+      "completions/min_terminated_length": 405.0,
+      "epoch": 0.026285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27512773871421814,
+      "learning_rate": 9.997258721585931e-07,
+      "loss": -0.0,
+      "num_tokens": 2866308.0,
+      "reward": 0.21871733665466309,
+      "reward_std": 0.5976030826568604,
+      "rewards/cosine_scaled_reward/mean": -0.10157884657382965,
+      "rewards/cosine_scaled_reward/std": 0.3856185972690582,
+      "rewards/format_reward/mean": 0.421875,
+      "rewards/format_reward/std": 0.49776285886764526,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1947.0,
+      "completions/mean_length": 1801.671875,
+      "completions/mean_terminated_length": 1259.75,
+      "completions/min_length": 573.0,
+      "completions/min_terminated_length": 573.0,
+      "epoch": 0.027428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22642865777015686,
+      "learning_rate": 9.993832906395582e-07,
+      "loss": -0.0,
+      "num_tokens": 2992543.0,
+      "reward": 0.04899948835372925,
+      "reward_std": 0.8525694608688354,
+      "rewards/cosine_scaled_reward/mean": -0.17081275582313538,
+      "rewards/cosine_scaled_reward/std": 0.3993513882160187,
+      "rewards/format_reward/mean": 0.390625,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1715.765625,
+      "completions/mean_terminated_length": 1035.4761962890625,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "epoch": 0.02857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25316134095191956,
+      "learning_rate": 9.989038226169207e-07,
+      "loss": -0.0,
+      "num_tokens": 3112648.0,
+      "reward": 0.10585837811231613,
+      "reward_std": 0.7828943729400635,
+      "rewards/cosine_scaled_reward/mean": -0.11894579976797104,
+      "rewards/cosine_scaled_reward/std": 0.4141720235347748,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1964.0,
+      "completions/mean_length": 1917.703125,
+      "completions/mean_terminated_length": 1452.357177734375,
+      "completions/min_length": 840.0,
+      "completions/min_terminated_length": 840.0,
+      "epoch": 0.029714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2521306574344635,
+      "learning_rate": 9.982876141412855e-07,
+      "loss": -0.0,
+      "num_tokens": 3246013.0,
+      "reward": 0.17620250582695007,
+      "reward_std": 0.6548349857330322,
+      "rewards/cosine_scaled_reward/mean": -0.08377375453710556,
+      "rewards/cosine_scaled_reward/std": 0.3527655303478241,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1990.0,
+      "completions/mean_length": 1851.015625,
+      "completions/mean_terminated_length": 1147.5,
+      "completions/min_length": 506.0,
+      "completions/min_terminated_length": 506.0,
+      "epoch": 0.030857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2730060815811157,
+      "learning_rate": 9.975348529157229e-07,
+      "loss": -0.0,
+      "num_tokens": 3374766.0,
+      "reward": -0.18854813277721405,
+      "reward_std": 0.49348777532577515,
+      "rewards/cosine_scaled_reward/mean": -0.21146157383918762,
+      "rewards/cosine_scaled_reward/std": 0.2601618766784668,
+      "rewards/format_reward/mean": 0.234375,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2023.0,
+      "completions/mean_length": 1798.328125,
+      "completions/mean_terminated_length": 1049.3125,
+      "completions/min_length": 403.0,
+      "completions/min_terminated_length": 403.0,
+      "epoch": 0.032,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2566036880016327,
+      "learning_rate": 9.96645768238595e-07,
+      "loss": 0.0,
+      "num_tokens": 3500195.0,
+      "reward": 0.06705980002880096,
+      "reward_std": 0.7090284824371338,
+      "rewards/cosine_scaled_reward/mean": -0.10709509253501892,
+      "rewards/cosine_scaled_reward/std": 0.4101051986217499,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1988.0,
+      "completions/mean_length": 1930.203125,
+      "completions/mean_terminated_length": 1210.3333740234375,
+      "completions/min_length": 582.0,
+      "completions/min_terminated_length": 582.0,
+      "epoch": 0.03314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25197461247444153,
+      "learning_rate": 9.956206309337066e-07,
+      "loss": 0.0,
+      "num_tokens": 3634200.0,
+      "reward": -0.2462695688009262,
+      "reward_std": 0.5237302780151367,
+      "rewards/cosine_scaled_reward/mean": -0.2012597918510437,
+      "rewards/cosine_scaled_reward/std": 0.23252712190151215,
+      "rewards/format_reward/mean": 0.15625,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1900.0,
+      "completions/mean_length": 1847.65625,
+      "completions/mean_terminated_length": 1061.6923828125,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "epoch": 0.03428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30431485176086426,
+      "learning_rate": 9.944597532678119e-07,
+      "loss": 0.0,
+      "num_tokens": 3762986.0,
+      "reward": -0.05392302945256233,
+      "reward_std": 0.7249555587768555,
+      "rewards/cosine_scaled_reward/mean": -0.15196150541305542,
+      "rewards/cosine_scaled_reward/std": 0.34566983580589294,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1860.0,
+      "completions/mean_length": 1838.671875,
+      "completions/mean_terminated_length": 931.5833740234375,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "epoch": 0.03542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2484513372182846,
+      "learning_rate": 9.931634888554935e-07,
+      "loss": 0.0,
+      "num_tokens": 3891157.0,
+      "reward": -0.11271396279335022,
+      "reward_std": 0.6705260872840881,
+      "rewards/cosine_scaled_reward/mean": -0.1813569962978363,
+      "rewards/cosine_scaled_reward/std": 0.4071698486804962,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1715.0,
+      "completions/mean_length": 1910.109375,
+      "completions/mean_terminated_length": 1417.6429443359375,
+      "completions/min_length": 906.0,
+      "completions/min_terminated_length": 906.0,
+      "epoch": 0.036571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25329527258872986,
+      "learning_rate": 9.917322325514487e-07,
+      "loss": -0.0,
+      "num_tokens": 4023756.0,
+      "reward": -0.08931556344032288,
+      "reward_std": 0.6381070613861084,
+      "rewards/cosine_scaled_reward/mean": -0.16965776681900024,
+      "rewards/cosine_scaled_reward/std": 0.37385129928588867,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.953125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1865.0,
+      "completions/mean_length": 2023.71875,
+      "completions/mean_terminated_length": 1530.0,
+      "completions/min_length": 1107.0,
+      "completions/min_terminated_length": 1107.0,
+      "epoch": 0.037714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22758109867572784,
+      "learning_rate": 9.901664203302124e-07,
+      "loss": 0.0,
+      "num_tokens": 4164490.0,
+      "reward": -0.4589868187904358,
+      "reward_std": 0.5177067518234253,
+      "rewards/cosine_scaled_reward/mean": -0.2919934093952179,
+      "rewards/cosine_scaled_reward/std": 0.2252870500087738,
+      "rewards/format_reward/mean": 0.125,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.453125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1999.0,
+      "completions/mean_length": 1454.78125,
+      "completions/mean_terminated_length": 963.2571411132812,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 0.038857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3234354257583618,
+      "learning_rate": 9.88466529153356e-07,
+      "loss": 0.0,
+      "num_tokens": 4267148.0,
+      "reward": 0.656031608581543,
+      "reward_std": 0.7529654502868652,
+      "rewards/cosine_scaled_reward/mean": 0.05457830801606178,
+      "rewards/cosine_scaled_reward/std": 0.49684229493141174,
+      "rewards/format_reward/mean": 0.546875,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1724.0,
+      "completions/mean_length": 1819.078125,
+      "completions/mean_terminated_length": 716.0909423828125,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "epoch": 0.04,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2821458876132965,
+      "learning_rate": 9.866330768241983e-07,
+      "loss": -0.0,
+      "num_tokens": 4395065.0,
+      "reward": -0.09630556404590607,
+      "reward_std": 0.7089139223098755,
+      "rewards/cosine_scaled_reward/mean": -0.15752778947353363,
+      "rewards/cosine_scaled_reward/std": 0.3647947609424591,
+      "rewards/format_reward/mean": 0.21875,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1811.0,
+      "completions/mean_length": 1954.34375,
+      "completions/mean_terminated_length": 1382.0,
+      "completions/min_length": 949.0,
+      "completions/min_terminated_length": 949.0,
+      "epoch": 0.04114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24163897335529327,
+      "learning_rate": 9.846666218300807e-07,
+      "loss": -0.0,
+      "num_tokens": 4531255.0,
+      "reward": -0.34593287110328674,
+      "reward_std": 0.44493502378463745,
+      "rewards/cosine_scaled_reward/mean": -0.24327893555164337,
+      "rewards/cosine_scaled_reward/std": 0.24784433841705322,
+      "rewards/format_reward/mean": 0.140625,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1723.0,
+      "completions/mean_length": 1868.921875,
+      "completions/mean_terminated_length": 1092.916748046875,
+      "completions/min_length": 620.0,
+      "completions/min_terminated_length": 620.0,
+      "epoch": 0.04228571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24795544147491455,
+      "learning_rate": 9.825677631722435e-07,
+      "loss": -0.0,
+      "num_tokens": 4661890.0,
+      "reward": -0.23053905367851257,
+      "reward_std": 0.34036368131637573,
+      "rewards/cosine_scaled_reward/mean": -0.2246445268392563,
+      "rewards/cosine_scaled_reward/std": 0.15942412614822388,
+      "rewards/format_reward/mean": 0.21875,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1397.0,
+      "completions/mean_length": 1889.53125,
+      "completions/mean_terminated_length": 1033.800048828125,
+      "completions/min_length": 810.0,
+      "completions/min_terminated_length": 810.0,
+      "epoch": 0.04342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24283826351165771,
+      "learning_rate": 9.80337140183366e-07,
+      "loss": 0.0,
+      "num_tokens": 4794532.0,
+      "reward": -0.10043507814407349,
+      "reward_std": 0.47925832867622375,
+      "rewards/cosine_scaled_reward/mean": -0.13615503907203674,
+      "rewards/cosine_scaled_reward/std": 0.3336707651615143,
+      "rewards/format_reward/mean": 0.171875,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1515.0,
+      "completions/mean_length": 1644.828125,
+      "completions/mean_terminated_length": 689.9473876953125,
+      "completions/min_length": 279.0,
+      "completions/min_terminated_length": 279.0,
+      "epoch": 0.044571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28362998366355896,
+      "learning_rate": 9.779754323328192e-07,
+      "loss": 0.0,
+      "num_tokens": 4910585.0,
+      "reward": 0.12284853309392929,
+      "reward_std": 0.4183085858821869,
+      "rewards/cosine_scaled_reward/mean": -0.11045074462890625,
+      "rewards/cosine_scaled_reward/std": 0.30217844247817993,
+      "rewards/format_reward/mean": 0.34375,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1632.0,
+      "completions/mean_length": 1618.28125,
+      "completions/mean_terminated_length": 902.0833740234375,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "epoch": 0.045714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.262617826461792,
+      "learning_rate": 9.754833590196926e-07,
+      "loss": 0.0,
+      "num_tokens": 5024227.0,
+      "reward": 0.2076582908630371,
+      "reward_std": 0.42125773429870605,
+      "rewards/cosine_scaled_reward/mean": -0.12273336946964264,
+      "rewards/cosine_scaled_reward/std": 0.4404613971710205,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1914.0,
+      "completions/mean_length": 1717.734375,
+      "completions/mean_terminated_length": 1235.0384521484375,
+      "completions/min_length": 664.0,
+      "completions/min_terminated_length": 664.0,
+      "epoch": 0.046857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23294499516487122,
+      "learning_rate": 9.728616793536587e-07,
+      "loss": -0.0,
+      "num_tokens": 5145314.0,
+      "reward": 0.011502981185913086,
+      "reward_std": 0.6816084980964661,
+      "rewards/cosine_scaled_reward/mean": -0.22081100940704346,
+      "rewards/cosine_scaled_reward/std": 0.37589573860168457,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1672.0,
+      "completions/mean_length": 1703.921875,
+      "completions/mean_terminated_length": 579.933349609375,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 0.048,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34672290086746216,
+      "learning_rate": 9.701111919237408e-07,
+      "loss": -0.0,
+      "num_tokens": 5264725.0,
+      "reward": -0.2616002857685089,
+      "reward_std": 0.37952175736427307,
+      "rewards/cosine_scaled_reward/mean": -0.26361262798309326,
+      "rewards/cosine_scaled_reward/std": 0.17531204223632812,
+      "rewards/format_reward/mean": 0.265625,
+      "rewards/format_reward/std": 0.44515693187713623,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1370.0,
+      "completions/mean_length": 1681.84375,
+      "completions/mean_terminated_length": 814.631591796875,
+      "completions/min_length": 308.0,
+      "completions/min_terminated_length": 308.0,
+      "epoch": 0.04914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.263967901468277,
+      "learning_rate": 9.672327345550543e-07,
+      "loss": -0.0,
+      "num_tokens": 5383979.0,
+      "reward": 0.13376155495643616,
+      "reward_std": 0.46012288331985474,
+      "rewards/cosine_scaled_reward/mean": -0.08155670762062073,
+      "rewards/cosine_scaled_reward/std": 0.3612325191497803,
+      "rewards/format_reward/mean": 0.296875,
+      "rewards/format_reward/std": 0.4604927599430084,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1830.0,
+      "completions/mean_length": 1624.625,
+      "completions/mean_terminated_length": 869.9130859375,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "epoch": 0.05028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28927963972091675,
+      "learning_rate": 9.64227184053598e-07,
+      "loss": -0.0,
+      "num_tokens": 5498651.0,
+      "reward": 0.20869271457195282,
+      "reward_std": 0.5558150410652161,
+      "rewards/cosine_scaled_reward/mean": -0.0987786278128624,
+      "rewards/cosine_scaled_reward/std": 0.42912590503692627,
+      "rewards/format_reward/mean": 0.40625,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.921875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1851.0,
+      "completions/mean_length": 2006.96875,
+      "completions/mean_terminated_length": 1522.800048828125,
+      "completions/min_length": 955.0,
+      "completions/min_terminated_length": 955.0,
+      "epoch": 0.05142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24254000186920166,
+      "learning_rate": 9.610954559391704e-07,
+      "loss": 0.0,
+      "num_tokens": 5638753.0,
+      "reward": -0.2540697157382965,
+      "reward_std": 0.4600578844547272,
+      "rewards/cosine_scaled_reward/mean": -0.20515984296798706,
+      "rewards/cosine_scaled_reward/std": 0.3251590430736542,
+      "rewards/format_reward/mean": 0.15625,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1563.0,
+      "completions/mean_length": 1765.984375,
+      "completions/mean_terminated_length": 919.9375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "epoch": 0.052571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2645930349826813,
+      "learning_rate": 9.578385041664925e-07,
+      "loss": 0.0,
+      "num_tokens": 5762944.0,
+      "reward": -0.213707834482193,
+      "reward_std": 0.38778313994407654,
+      "rewards/cosine_scaled_reward/mean": -0.2318539321422577,
+      "rewards/cosine_scaled_reward/std": 0.21436986327171326,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1993.0,
+      "completions/mean_length": 1583.40625,
+      "completions/mean_terminated_length": 986.0714721679688,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "epoch": 0.053714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.311797559261322,
+      "learning_rate": 9.54457320834625e-07,
+      "loss": 0.0,
+      "num_tokens": 5874682.0,
+      "reward": 0.27925533056259155,
+      "reward_std": 0.6467443704605103,
+      "rewards/cosine_scaled_reward/mean": -0.07912233471870422,
+      "rewards/cosine_scaled_reward/std": 0.4737093150615692,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1527.0,
+      "completions/mean_length": 1690.0625,
+      "completions/mean_terminated_length": 1006.727294921875,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "epoch": 0.054857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26644304394721985,
+      "learning_rate": 9.509529358847654e-07,
+      "loss": -0.0,
+      "num_tokens": 5993390.0,
+      "reward": 0.13692031800746918,
+      "reward_std": 0.5655145049095154,
+      "rewards/cosine_scaled_reward/mean": -0.12685233354568481,
+      "rewards/cosine_scaled_reward/std": 0.32320985198020935,
+      "rewards/format_reward/mean": 0.390625,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2021.0,
+      "completions/mean_length": 1387.140625,
+      "completions/mean_terminated_length": 804.0294189453125,
+      "completions/min_length": 300.0,
+      "completions/min_terminated_length": 300.0,
+      "epoch": 0.056,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3078882396221161,
+      "learning_rate": 9.473264167865171e-07,
+      "loss": 0.0,
+      "num_tokens": 6092231.0,
+      "reward": 0.35559189319610596,
+      "reward_std": 0.5927403569221497,
+      "rewards/cosine_scaled_reward/mean": -0.09564155340194702,
+      "rewards/cosine_scaled_reward/std": 0.4046906530857086,
+      "rewards/format_reward/mean": 0.546875,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1830.0,
+      "completions/mean_length": 1674.890625,
+      "completions/mean_terminated_length": 962.5909423828125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "epoch": 0.05714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23925544321537018,
+      "learning_rate": 9.43578868212728e-07,
+      "loss": -0.0,
+      "num_tokens": 6210240.0,
+      "reward": 0.18573230504989624,
+      "reward_std": 0.5264967083930969,
+      "rewards/cosine_scaled_reward/mean": -0.09463384002447128,
+      "rewards/cosine_scaled_reward/std": 0.4100942015647888,
+      "rewards/format_reward/mean": 0.375,
+      "rewards/format_reward/std": 0.48795005679130554,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.421875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2024.0,
+      "completions/mean_length": 1347.40625,
+      "completions/mean_terminated_length": 836.1621704101562,
+      "completions/min_length": 394.0,
+      "completions/min_terminated_length": 394.0,
+      "epoch": 0.05828571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.325811505317688,
+      "learning_rate": 9.397114317029974e-07,
+      "loss": 0.0,
+      "num_tokens": 6306682.0,
+      "reward": 0.1735648661851883,
+      "reward_std": 0.5335988998413086,
+      "rewards/cosine_scaled_reward/mean": -0.21009255945682526,
+      "rewards/cosine_scaled_reward/std": 0.2623959481716156,
+      "rewards/format_reward/mean": 0.59375,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1390.0,
+      "completions/mean_length": 1727.765625,
+      "completions/mean_terminated_length": 767.0625,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 0.05942857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27392977476119995,
+      "learning_rate": 9.357252853159505e-07,
+      "loss": 0.0,
+      "num_tokens": 6428611.0,
+      "reward": -0.16267812252044678,
+      "reward_std": 0.5682471990585327,
+      "rewards/cosine_scaled_reward/mean": -0.2219640612602234,
+      "rewards/cosine_scaled_reward/std": 0.36739134788513184,
+      "rewards/format_reward/mean": 0.28125,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1907.0,
+      "completions/mean_length": 1609.171875,
+      "completions/mean_terminated_length": 924.5999755859375,
+      "completions/min_length": 513.0,
+      "completions/min_terminated_length": 513.0,
+      "epoch": 0.060571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28155064582824707,
+      "learning_rate": 9.316216432703916e-07,
+      "loss": -0.0,
+      "num_tokens": 6542430.0,
+      "reward": 0.0752667784690857,
+      "reward_std": 0.7118167281150818,
+      "rewards/cosine_scaled_reward/mean": -0.18892911076545715,
+      "rewards/cosine_scaled_reward/std": 0.3222156763076782,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1986.0,
+      "completions/mean_length": 1588.234375,
+      "completions/mean_terminated_length": 1067.166748046875,
+      "completions/min_length": 519.0,
+      "completions/min_terminated_length": 519.0,
+      "epoch": 0.061714285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2555343806743622,
+      "learning_rate": 9.274017555754407e-07,
+      "loss": 0.0,
+      "num_tokens": 6655221.0,
+      "reward": 0.6341299414634705,
+      "reward_std": 1.0656921863555908,
+      "rewards/cosine_scaled_reward/mean": 0.05143994837999344,
+      "rewards/cosine_scaled_reward/std": 0.5348308086395264,
+      "rewards/format_reward/mean": 0.53125,
+      "rewards/format_reward/std": 0.5029674172401428,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1420.0,
+      "completions/mean_length": 1549.5625,
+      "completions/mean_terminated_length": 821.0769653320312,
+      "completions/min_length": 444.0,
+      "completions/min_terminated_length": 444.0,
+      "epoch": 0.06285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30243629217147827,
+      "learning_rate": 9.230669076497687e-07,
+      "loss": -0.0,
+      "num_tokens": 6764681.0,
+      "reward": 0.13021975755691528,
+      "reward_std": 0.3984764516353607,
+      "rewards/cosine_scaled_reward/mean": -0.13801513612270355,
+      "rewards/cosine_scaled_reward/std": 0.41228073835372925,
+      "rewards/format_reward/mean": 0.40625,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2040.0,
+      "completions/mean_length": 1633.25,
+      "completions/mean_terminated_length": 1132.689697265625,
+      "completions/min_length": 543.0,
+      "completions/min_terminated_length": 543.0,
+      "epoch": 0.064,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23835402727127075,
+      "learning_rate": 9.186184199300463e-07,
+      "loss": -0.0,
+      "num_tokens": 6880169.0,
+      "reward": 0.27981996536254883,
+      "reward_std": 0.5018116235733032,
+      "rewards/cosine_scaled_reward/mean": -0.10227750986814499,
+      "rewards/cosine_scaled_reward/std": 0.481824666261673,
+      "rewards/format_reward/mean": 0.484375,
+      "rewards/format_reward/std": 0.5037065148353577,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1807.0,
+      "completions/mean_length": 1699.875,
+      "completions/mean_terminated_length": 1156.7999267578125,
+      "completions/min_length": 642.0,
+      "completions/min_terminated_length": 642.0,
+      "epoch": 0.06514285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22349494695663452,
+      "learning_rate": 9.140576474687263e-07,
+      "loss": 0.0,
+      "num_tokens": 7000529.0,
+      "reward": -0.026505012065172195,
+      "reward_std": 0.5785415172576904,
+      "rewards/cosine_scaled_reward/mean": -0.20856501162052155,
+      "rewards/cosine_scaled_reward/std": 0.2749907374382019,
+      "rewards/format_reward/mean": 0.390625,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2044.0,
+      "completions/mean_length": 1457.875,
+      "completions/mean_terminated_length": 1054.105224609375,
+      "completions/min_length": 447.0,
+      "completions/min_terminated_length": 447.0,
+      "epoch": 0.06628571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.261942595243454,
+      "learning_rate": 9.093859795212817e-07,
+      "loss": 0.0,
+      "num_tokens": 7103929.0,
+      "reward": 0.5745843648910522,
+      "reward_std": 0.8671218156814575,
+      "rewards/cosine_scaled_reward/mean": -0.03302033245563507,
+      "rewards/cosine_scaled_reward/std": 0.45529407262802124,
+      "rewards/format_reward/mean": 0.640625,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.484375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2010.0,
+      "completions/mean_length": 1590.0625,
+      "completions/mean_terminated_length": 1159.8787841796875,
+      "completions/min_length": 591.0,
+      "completions/min_terminated_length": 591.0,
+      "epoch": 0.06742857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24828943610191345,
+      "learning_rate": 9.046048391230247e-07,
+      "loss": -0.0,
+      "num_tokens": 7216157.0,
+      "reward": 0.3377103805541992,
+      "reward_std": 0.5543617010116577,
+      "rewards/cosine_scaled_reward/mean": -0.1045822948217392,
+      "rewards/cosine_scaled_reward/std": 0.39040952920913696,
+      "rewards/format_reward/mean": 0.546875,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1976.0,
+      "completions/mean_length": 1622.84375,
+      "completions/mean_terminated_length": 1076.21435546875,
+      "completions/min_length": 555.0,
+      "completions/min_terminated_length": 555.0,
+      "epoch": 0.06857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2752656936645508,
+      "learning_rate": 8.997156826556369e-07,
+      "loss": -0.0,
+      "num_tokens": 7330907.0,
+      "reward": 0.11114693433046341,
+      "reward_std": 0.6926254034042358,
+      "rewards/cosine_scaled_reward/mean": -0.1788015365600586,
+      "rewards/cosine_scaled_reward/std": 0.39409172534942627,
+      "rewards/format_reward/mean": 0.46875,
+      "rewards/format_reward/std": 0.5029674172401428,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1984.0,
+      "completions/mean_length": 1708.859375,
+      "completions/mean_terminated_length": 1014.4285888671875,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "epoch": 0.06971428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22669929265975952,
+      "learning_rate": 8.9471999940354e-07,
+      "loss": -0.0,
+      "num_tokens": 7451794.0,
+      "reward": 0.2345120906829834,
+      "reward_std": 0.6293160319328308,
+      "rewards/cosine_scaled_reward/mean": -0.1093064472079277,
+      "rewards/cosine_scaled_reward/std": 0.29189831018447876,
+      "rewards/format_reward/mean": 0.453125,
+      "rewards/format_reward/std": 0.501733124256134,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2008.0,
+      "completions/mean_length": 1281.53125,
+      "completions/mean_terminated_length": 1004.2978515625,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "epoch": 0.07085714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25438693165779114,
+      "learning_rate": 8.896193111002475e-07,
+      "loss": 0.0,
+      "num_tokens": 7544044.0,
+      "reward": 0.9180847406387329,
+      "reward_std": 0.6390912532806396,
+      "rewards/cosine_scaled_reward/mean": 0.06841734796762466,
+      "rewards/cosine_scaled_reward/std": 0.48315128684043884,
+      "rewards/format_reward/mean": 0.78125,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.359375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1678.0,
+      "completions/mean_length": 1310.46875,
+      "completions/mean_terminated_length": 896.731689453125,
+      "completions/min_length": 295.0,
+      "completions/min_terminated_length": 295.0,
+      "epoch": 0.072,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28795576095581055,
+      "learning_rate": 8.844151714648274e-07,
+      "loss": -0.0,
+      "num_tokens": 7638170.0,
+      "reward": 0.6488770246505737,
+      "reward_std": 0.7876260876655579,
+      "rewards/cosine_scaled_reward/mean": -0.019311510026454926,
+      "rewards/cosine_scaled_reward/std": 0.4736698865890503,
+      "rewards/format_reward/mean": 0.6875,
+      "rewards/format_reward/std": 0.467176616191864,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1969.0,
+      "completions/mean_length": 1307.625,
+      "completions/mean_terminated_length": 1039.8297119140625,
+      "completions/min_length": 376.0,
+      "completions/min_terminated_length": 376.0,
+      "epoch": 0.07314285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25637197494506836,
+      "learning_rate": 8.791091657286267e-07,
+      "loss": -0.0,
+      "num_tokens": 7732810.0,
+      "reward": 0.8280279636383057,
+      "reward_std": 0.6804471015930176,
+      "rewards/cosine_scaled_reward/mean": 0.015576483681797981,
+      "rewards/cosine_scaled_reward/std": 0.44819310307502747,
+      "rewards/format_reward/mean": 0.796875,
+      "rewards/format_reward/std": 0.40550529956817627,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.359375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1846.0,
+      "completions/mean_length": 1322.125,
+      "completions/mean_terminated_length": 914.9268188476562,
+      "completions/min_length": 297.0,
+      "completions/min_terminated_length": 297.0,
+      "epoch": 0.07428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2944399118423462,
+      "learning_rate": 8.737029101523929e-07,
+      "loss": -0.0,
+      "num_tokens": 7828130.0,
+      "reward": 0.15610456466674805,
+      "reward_std": 0.4606686234474182,
+      "rewards/cosine_scaled_reward/mean": -0.24226020276546478,
+      "rewards/cosine_scaled_reward/std": 0.33131492137908936,
+      "rewards/format_reward/mean": 0.640625,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1803.0,
+      "completions/mean_length": 1020.21875,
+      "completions/mean_terminated_length": 806.9057006835938,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.07542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32644009590148926,
+      "learning_rate": 8.681980515339463e-07,
+      "loss": 0.0,
+      "num_tokens": 7903656.0,
+      "reward": 0.7972471714019775,
+      "reward_std": 0.7674820423126221,
+      "rewards/cosine_scaled_reward/mean": -0.031063925474882126,
+      "rewards/cosine_scaled_reward/std": 0.5106223225593567,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1910.0,
+      "completions/mean_length": 1750.859375,
+      "completions/mean_terminated_length": 1142.4285888671875,
+      "completions/min_length": 585.0,
+      "completions/min_terminated_length": 585.0,
+      "epoch": 0.07657142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2270829975605011,
+      "learning_rate": 8.625962667065487e-07,
+      "loss": 0.0,
+      "num_tokens": 8026447.0,
+      "reward": -0.1400720775127411,
+      "reward_std": 0.3325888514518738,
+      "rewards/cosine_scaled_reward/mean": -0.24972353875637054,
+      "rewards/cosine_scaled_reward/std": 0.16404789686203003,
+      "rewards/format_reward/mean": 0.359375,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1424.0,
+      "completions/mean_length": 769.546875,
+      "completions/mean_terminated_length": 637.2930908203125,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 0.07771428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.37025144696235657,
+      "learning_rate": 8.568992620281243e-07,
+      "loss": -0.0,
+      "num_tokens": 8084954.0,
+      "reward": 0.9792699813842773,
+      "reward_std": 0.804767370223999,
+      "rewards/cosine_scaled_reward/mean": 0.03651002421975136,
+      "rewards/cosine_scaled_reward/std": 0.46041443943977356,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1701.0,
+      "completions/mean_length": 1086.234375,
+      "completions/mean_terminated_length": 886.6226806640625,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 0.07885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3763800263404846,
+      "learning_rate": 8.511087728614862e-07,
+      "loss": 0.0,
+      "num_tokens": 8164817.0,
+      "reward": 0.35803771018981934,
+      "reward_std": 0.5702667236328125,
+      "rewards/cosine_scaled_reward/mean": -0.24285613000392914,
+      "rewards/cosine_scaled_reward/std": 0.3019825220108032,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1898.0,
+      "completions/mean_length": 1463.375,
+      "completions/mean_terminated_length": 1112.5999755859375,
+      "completions/min_length": 503.0,
+      "completions/min_terminated_length": 503.0,
+      "epoch": 0.08,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24232418835163116,
+      "learning_rate": 8.452265630457282e-07,
+      "loss": -0.0,
+      "num_tokens": 8269929.0,
+      "reward": 0.3703588843345642,
+      "reward_std": 0.7288752794265747,
+      "rewards/cosine_scaled_reward/mean": -0.1351330280303955,
+      "rewards/cosine_scaled_reward/std": 0.3751916289329529,
+      "rewards/format_reward/mean": 0.640625,
+      "rewards/format_reward/std": 0.4836103618144989,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1991.0,
+      "completions/mean_length": 1409.859375,
+      "completions/mean_terminated_length": 973.2368774414062,
+      "completions/min_length": 404.0,
+      "completions/min_terminated_length": 404.0,
+      "epoch": 0.08114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.300010621547699,
+      "learning_rate": 8.392544243589427e-07,
+      "loss": 0.0,
+      "num_tokens": 8370880.0,
+      "reward": 0.5196826457977295,
+      "reward_std": 0.7097917795181274,
+      "rewards/cosine_scaled_reward/mean": -0.044846177101135254,
+      "rewards/cosine_scaled_reward/std": 0.508389949798584,
+      "rewards/format_reward/mean": 0.609375,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1801.0,
+      "completions/mean_length": 1228.046875,
+      "completions/mean_terminated_length": 931.4680786132812,
+      "completions/min_length": 331.0,
+      "completions/min_terminated_length": 331.0,
+      "epoch": 0.08228571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30454304814338684,
+      "learning_rate": 8.331941759724268e-07,
+      "loss": -0.0,
+      "num_tokens": 8459827.0,
+      "reward": 0.41365131735801697,
+      "reward_std": 0.5005639791488647,
+      "rewards/cosine_scaled_reward/mean": -0.1759868562221527,
+      "rewards/cosine_scaled_reward/std": 0.19868774712085724,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1719.0,
+      "completions/mean_length": 1513.28125,
+      "completions/mean_terminated_length": 1192.4500732421875,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "epoch": 0.08342857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27848970890045166,
+      "learning_rate": 8.270476638965461e-07,
+      "loss": -0.0,
+      "num_tokens": 8567405.0,
+      "reward": 0.09570223093032837,
+      "reward_std": 0.5445049405097961,
+      "rewards/cosine_scaled_reward/mean": -0.2802739143371582,
+      "rewards/cosine_scaled_reward/std": 0.25603488087654114,
+      "rewards/format_reward/mean": 0.65625,
+      "rewards/format_reward/std": 0.4787135720252991,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1888.0,
+      "completions/mean_length": 1240.125,
+      "completions/mean_terminated_length": 924.0,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "epoch": 0.08457142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2789021134376526,
+      "learning_rate": 8.208167604184217e-07,
+      "loss": 0.0,
+      "num_tokens": 8656701.0,
+      "reward": 0.7823752760887146,
+      "reward_std": 0.6479132175445557,
+      "rewards/cosine_scaled_reward/mean": 0.031812600791454315,
+      "rewards/cosine_scaled_reward/std": 0.5397623181343079,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2036.0,
+      "completions/mean_length": 1455.953125,
+      "completions/mean_terminated_length": 1186.8409423828125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "epoch": 0.08571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22443196177482605,
+      "learning_rate": 8.145033635316128e-07,
+      "loss": 0.0,
+      "num_tokens": 8760842.0,
+      "reward": 0.8040015697479248,
+      "reward_std": 0.5675323009490967,
+      "rewards/cosine_scaled_reward/mean": 0.027000809088349342,
+      "rewards/cosine_scaled_reward/std": 0.5096040964126587,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1720.0,
+      "completions/mean_length": 1177.859375,
+      "completions/mean_terminated_length": 863.1276245117188,
+      "completions/min_length": 372.0,
+      "completions/min_terminated_length": 372.0,
+      "epoch": 0.08685714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32647648453712463,
+      "learning_rate": 8.081093963579707e-07,
+      "loss": 0.0,
+      "num_tokens": 8846625.0,
+      "reward": 0.310506671667099,
+      "reward_std": 0.5110941529273987,
+      "rewards/cosine_scaled_reward/mean": -0.2119341641664505,
+      "rewards/cosine_scaled_reward/std": 0.24737994372844696,
+      "rewards/format_reward/mean": 0.734375,
+      "rewards/format_reward/std": 0.44515693187713623,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1787.0,
+      "completions/mean_length": 1263.4375,
+      "completions/mean_terminated_length": 1043.760009765625,
+      "completions/min_length": 501.0,
+      "completions/min_terminated_length": 501.0,
+      "epoch": 0.088,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2545543611049652,
+      "learning_rate": 8.01636806561836e-07,
+      "loss": -0.0,
+      "num_tokens": 8939061.0,
+      "reward": 0.5484907031059265,
+      "reward_std": 0.48998576402664185,
+      "rewards/cosine_scaled_reward/mean": -0.13200464844703674,
+      "rewards/cosine_scaled_reward/std": 0.3430649936199188,
+      "rewards/format_reward/mean": 0.8125,
+      "rewards/format_reward/std": 0.39339789748191833,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1991.0,
+      "completions/mean_length": 1460.78125,
+      "completions/mean_terminated_length": 1059.0,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "epoch": 0.08914285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2583931088447571,
+      "learning_rate": 7.950875657567621e-07,
+      "loss": 0.0,
+      "num_tokens": 9043271.0,
+      "reward": 0.6075442433357239,
+      "reward_std": 0.6895643472671509,
+      "rewards/cosine_scaled_reward/mean": -0.0009153857827186584,
+      "rewards/cosine_scaled_reward/std": 0.48922818899154663,
+      "rewards/format_reward/mean": 0.609375,
+      "rewards/format_reward/std": 0.4917473793029785,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1956.0,
+      "completions/mean_length": 1054.875,
+      "completions/mean_terminated_length": 892.3635864257812,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 0.09028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29089078307151794,
+      "learning_rate": 7.884636689049422e-07,
+      "loss": 0.0,
+      "num_tokens": 9120879.0,
+      "reward": 0.6885831356048584,
+      "reward_std": 0.508629322052002,
+      "rewards/cosine_scaled_reward/mean": -0.09320840239524841,
+      "rewards/cosine_scaled_reward/std": 0.38835227489471436,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2037.0,
+      "completions/mean_length": 1399.046875,
+      "completions/mean_terminated_length": 1145.1087646484375,
+      "completions/min_length": 484.0,
+      "completions/min_terminated_length": 484.0,
+      "epoch": 0.09142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27458345890045166,
+      "learning_rate": 7.817671337095244e-07,
+      "loss": 0.0,
+      "num_tokens": 9220810.0,
+      "reward": 0.5549384355545044,
+      "reward_std": 0.7092134952545166,
+      "rewards/cosine_scaled_reward/mean": -0.09753081202507019,
+      "rewards/cosine_scaled_reward/std": 0.4125780463218689,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1833.0,
+      "completions/mean_length": 1084.984375,
+      "completions/mean_terminated_length": 906.6481323242188,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "epoch": 0.09257142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.37247684597969055,
+      "learning_rate": 7.75e-07,
+      "loss": -0.0,
+      "num_tokens": 9301521.0,
+      "reward": 0.5357480049133301,
+      "reward_std": 0.5661624670028687,
+      "rewards/cosine_scaled_reward/mean": -0.18525099754333496,
+      "rewards/cosine_scaled_reward/std": 0.3385297954082489,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2046.0,
+      "completions/mean_length": 1260.921875,
+      "completions/mean_terminated_length": 998.5625,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "epoch": 0.09371428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27329322695732117,
+      "learning_rate": 7.681643291108517e-07,
+      "loss": -0.0,
+      "num_tokens": 9392548.0,
+      "reward": 0.9478914737701416,
+      "reward_std": 0.4313860237598419,
+      "rewards/cosine_scaled_reward/mean": 0.09894578158855438,
+      "rewards/cosine_scaled_reward/std": 0.5477120876312256,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2039.0,
+      "completions/mean_length": 1309.671875,
+      "completions/mean_terminated_length": 922.9285888671875,
+      "completions/min_length": 303.0,
+      "completions/min_terminated_length": 303.0,
+      "epoch": 0.09485714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3202998638153076,
+      "learning_rate": 7.612622032536507e-07,
+      "loss": -0.0,
+      "num_tokens": 9487455.0,
+      "reward": 0.5201998949050903,
+      "reward_std": 0.6858996152877808,
+      "rewards/cosine_scaled_reward/mean": -0.09927503764629364,
+      "rewards/cosine_scaled_reward/std": 0.37909674644470215,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1685.0,
+      "completions/mean_length": 1185.703125,
+      "completions/mean_terminated_length": 965.9019775390625,
+      "completions/min_length": 390.0,
+      "completions/min_terminated_length": 390.0,
+      "epoch": 0.096,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29646041989326477,
+      "learning_rate": 7.54295724882796e-07,
+      "loss": -0.0,
+      "num_tokens": 9574036.0,
+      "reward": 0.6779025793075562,
+      "reward_std": 0.557724118232727,
+      "rewards/cosine_scaled_reward/mean": -0.09073619544506073,
+      "rewards/cosine_scaled_reward/std": 0.3855368196964264,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2021.0,
+      "completions/mean_length": 1297.828125,
+      "completions/mean_terminated_length": 1158.907470703125,
+      "completions/min_length": 601.0,
+      "completions/min_terminated_length": 601.0,
+      "epoch": 0.09714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21307455003261566,
+      "learning_rate": 7.472670160550848e-07,
+      "loss": 0.0,
+      "num_tokens": 9667417.0,
+      "reward": 0.5093189477920532,
+      "reward_std": 0.6006681323051453,
+      "rewards/cosine_scaled_reward/mean": -0.1672155261039734,
+      "rewards/cosine_scaled_reward/std": 0.34896284341812134,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1859.0,
+      "completions/mean_length": 1348.90625,
+      "completions/mean_terminated_length": 1096.04248046875,
+      "completions/min_length": 501.0,
+      "completions/min_terminated_length": 501.0,
+      "epoch": 0.09828571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2883393168449402,
+      "learning_rate": 7.401782177833147e-07,
+      "loss": -0.0,
+      "num_tokens": 9764603.0,
+      "reward": 0.8025823831558228,
+      "reward_std": 0.547119677066803,
+      "rewards/cosine_scaled_reward/mean": 0.01847870647907257,
+      "rewards/cosine_scaled_reward/std": 0.4346420168876648,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1782.0,
+      "completions/mean_length": 1086.96875,
+      "completions/mean_terminated_length": 909.0,
+      "completions/min_length": 350.0,
+      "completions/min_terminated_length": 350.0,
+      "epoch": 0.09942857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31888866424560547,
+      "learning_rate": 7.330314893841101e-07,
+      "loss": -0.0,
+      "num_tokens": 9844289.0,
+      "reward": 0.5533354878425598,
+      "reward_std": 0.5319498777389526,
+      "rewards/cosine_scaled_reward/mean": -0.1530197560787201,
+      "rewards/cosine_scaled_reward/std": 0.2434682846069336,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1984.0,
+      "completions/mean_length": 954.921875,
+      "completions/mean_terminated_length": 919.6612548828125,
+      "completions/min_length": 114.0,
+      "completions/min_terminated_length": 114.0,
+      "epoch": 0.10057142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3025936484336853,
+      "learning_rate": 7.258290078201731e-07,
+      "loss": -0.0,
+      "num_tokens": 9915916.0,
+      "reward": 1.2692296504974365,
+      "reward_std": 0.5115163326263428,
+      "rewards/cosine_scaled_reward/mean": 0.13461479544639587,
+      "rewards/cosine_scaled_reward/std": 0.506001353263855,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1926.0,
+      "completions/mean_length": 1351.8125,
+      "completions/mean_terminated_length": 1174.35302734375,
+      "completions/min_length": 650.0,
+      "completions/min_terminated_length": 650.0,
+      "epoch": 0.10171428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23423585295677185,
+      "learning_rate": 7.185729670371604e-07,
+      "loss": -0.0,
+      "num_tokens": 10013432.0,
+      "reward": 0.724889874458313,
+      "reward_std": 0.7425336837768555,
+      "rewards/cosine_scaled_reward/mean": -0.0828675627708435,
+      "rewards/cosine_scaled_reward/std": 0.3893774449825287,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1906.0,
+      "completions/mean_length": 1153.28125,
+      "completions/mean_terminated_length": 1025.46435546875,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "epoch": 0.10285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3860023021697998,
+      "learning_rate": 7.11265577295385e-07,
+      "loss": -0.0,
+      "num_tokens": 10097242.0,
+      "reward": 0.5000253915786743,
+      "reward_std": 0.5103108286857605,
+      "rewards/cosine_scaled_reward/mean": -0.18748730421066284,
+      "rewards/cosine_scaled_reward/std": 0.2787182629108429,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.328125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2032.0,
+      "completions/mean_length": 1455.484375,
+      "completions/mean_terminated_length": 1166.1163330078125,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "epoch": 0.104,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2551063895225525,
+      "learning_rate": 7.039090644965509e-07,
+      "loss": 0.0,
+      "num_tokens": 10200961.0,
+      "reward": 0.4053259789943695,
+      "reward_std": 0.663999617099762,
+      "rewards/cosine_scaled_reward/mean": -0.18796202540397644,
+      "rewards/cosine_scaled_reward/std": 0.35777655243873596,
+      "rewards/format_reward/mean": 0.78125,
+      "rewards/format_reward/std": 0.4166666865348816,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2004.0,
+      "completions/mean_length": 1176.953125,
+      "completions/mean_terminated_length": 1015.6481323242188,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "epoch": 0.10514285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27449366450309753,
+      "learning_rate": 6.965056695057204e-07,
+      "loss": -0.0,
+      "num_tokens": 10286278.0,
+      "reward": 0.5743436217308044,
+      "reward_std": 0.6229422092437744,
+      "rewards/cosine_scaled_reward/mean": -0.15032817423343658,
+      "rewards/cosine_scaled_reward/std": 0.2899566888809204,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2027.0,
+      "completions/mean_length": 1434.875,
+      "completions/mean_terminated_length": 1156.181884765625,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "epoch": 0.10628571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2839376926422119,
+      "learning_rate": 6.890576474687263e-07,
+      "loss": 0.0,
+      "num_tokens": 10389454.0,
+      "reward": 0.30658647418022156,
+      "reward_std": 0.5343226194381714,
+      "rewards/cosine_scaled_reward/mean": -0.22951926290988922,
+      "rewards/cosine_scaled_reward/std": 0.2324177473783493,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1684.0,
+      "completions/mean_length": 1242.390625,
+      "completions/mean_terminated_length": 927.1522216796875,
+      "completions/min_length": 508.0,
+      "completions/min_terminated_length": 508.0,
+      "epoch": 0.10742857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2985072433948517,
+      "learning_rate": 6.815672671252315e-07,
+      "loss": 0.0,
+      "num_tokens": 10478735.0,
+      "reward": 0.6593698263168335,
+      "reward_std": 0.5845412015914917,
+      "rewards/cosine_scaled_reward/mean": -0.02969011664390564,
+      "rewards/cosine_scaled_reward/std": 0.47056320309638977,
+      "rewards/format_reward/mean": 0.71875,
+      "rewards/format_reward/std": 0.4531635046005249,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1931.0,
+      "completions/mean_length": 1203.265625,
+      "completions/mean_terminated_length": 1082.58935546875,
+      "completions/min_length": 573.0,
+      "completions/min_terminated_length": 573.0,
+      "epoch": 0.10857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2689598798751831,
+      "learning_rate": 6.740368101176495e-07,
+      "loss": 0.0,
+      "num_tokens": 10566272.0,
+      "reward": 0.4301251173019409,
+      "reward_std": 0.4795047640800476,
+      "rewards/cosine_scaled_reward/mean": -0.22243742644786835,
+      "rewards/cosine_scaled_reward/std": 0.2575407326221466,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1827.0,
+      "completions/mean_length": 1205.5625,
+      "completions/mean_terminated_length": 990.8235473632812,
+      "completions/min_length": 441.0,
+      "completions/min_terminated_length": 441.0,
+      "epoch": 0.10971428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30502915382385254,
+      "learning_rate": 6.664685702961344e-07,
+      "loss": -0.0,
+      "num_tokens": 10654564.0,
+      "reward": 0.896080493927002,
+      "reward_std": 0.6987663507461548,
+      "rewards/cosine_scaled_reward/mean": 0.02616523765027523,
+      "rewards/cosine_scaled_reward/std": 0.460237056016922,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1777.0,
+      "completions/mean_length": 1170.390625,
+      "completions/mean_terminated_length": 988.2453002929688,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "epoch": 0.11085714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3103901743888855,
+      "learning_rate": 6.588648530198504e-07,
+      "loss": -0.0,
+      "num_tokens": 10739733.0,
+      "reward": 0.6633297204971313,
+      "reward_std": 0.609075665473938,
+      "rewards/cosine_scaled_reward/mean": -0.12927262485027313,
+      "rewards/cosine_scaled_reward/std": 0.4114542305469513,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1817.0,
+      "completions/mean_length": 1136.5625,
+      "completions/mean_terminated_length": 947.396240234375,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "epoch": 0.112,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2510873079299927,
+      "learning_rate": 6.512279744547392e-07,
+      "loss": 0.0,
+      "num_tokens": 10823537.0,
+      "reward": 0.6613268256187439,
+      "reward_std": 0.4785424768924713,
+      "rewards/cosine_scaled_reward/mean": -0.09902409464120865,
+      "rewards/cosine_scaled_reward/std": 0.4345317482948303,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2031.0,
+      "completions/mean_length": 1171.8125,
+      "completions/mean_terminated_length": 1081.17236328125,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 0.11314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.281054824590683,
+      "learning_rate": 6.435602608679916e-07,
+      "loss": -0.0,
+      "num_tokens": 10909701.0,
+      "reward": 1.0416245460510254,
+      "reward_std": 0.6949809789657593,
+      "rewards/cosine_scaled_reward/mean": 0.0520622618496418,
+      "rewards/cosine_scaled_reward/std": 0.508481502532959,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1935.0,
+      "completions/mean_length": 1120.8125,
+      "completions/mean_terminated_length": 1024.8966064453125,
+      "completions/min_length": 410.0,
+      "completions/min_terminated_length": 410.0,
+      "epoch": 0.11428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2910788655281067,
+      "learning_rate": 6.358640479194451e-07,
+      "loss": 0.0,
+      "num_tokens": 10991145.0,
+      "reward": 1.2036188840866089,
+      "reward_std": 0.8533884286880493,
+      "rewards/cosine_scaled_reward/mean": 0.14087192714214325,
+      "rewards/cosine_scaled_reward/std": 0.5375887751579285,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1978.0,
+      "completions/mean_length": 1076.953125,
+      "completions/mean_terminated_length": 1029.1966552734375,
+      "completions/min_length": 423.0,
+      "completions/min_terminated_length": 423.0,
+      "epoch": 0.11542857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33955609798431396,
+      "learning_rate": 6.281416799501187e-07,
+      "loss": 0.0,
+      "num_tokens": 11071502.0,
+      "reward": 0.7810705900192261,
+      "reward_std": 0.5973731279373169,
+      "rewards/cosine_scaled_reward/mean": -0.10165221989154816,
+      "rewards/cosine_scaled_reward/std": 0.4130260646343231,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1959.0,
+      "completions/mean_length": 1092.078125,
+      "completions/mean_terminated_length": 935.654541015625,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "epoch": 0.11657142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34537607431411743,
+      "learning_rate": 6.203955092681039e-07,
+      "loss": 0.0,
+      "num_tokens": 11151547.0,
+      "reward": 0.6441041231155396,
+      "reward_std": 0.53089839220047,
+      "rewards/cosine_scaled_reward/mean": -0.10763543844223022,
+      "rewards/cosine_scaled_reward/std": 0.39948928356170654,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2039.0,
+      "completions/mean_length": 1120.625,
+      "completions/mean_terminated_length": 1006.7368774414062,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 0.11771428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.343980997800827,
+      "learning_rate": 6.126278954320294e-07,
+      "loss": 0.0,
+      "num_tokens": 11233619.0,
+      "reward": 0.6925251483917236,
+      "reward_std": 0.5938367247581482,
+      "rewards/cosine_scaled_reward/mean": -0.13029994070529938,
+      "rewards/cosine_scaled_reward/std": 0.37749138474464417,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1913.0,
+      "completions/mean_length": 1120.359375,
+      "completions/mean_terminated_length": 948.5740966796875,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "epoch": 0.11885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30854102969169617,
+      "learning_rate": 6.048412045323164e-07,
+      "loss": -0.0,
+      "num_tokens": 11315786.0,
+      "reward": 0.560060977935791,
+      "reward_std": 0.5216183662414551,
+      "rewards/cosine_scaled_reward/mean": -0.1418444812297821,
+      "rewards/cosine_scaled_reward/std": 0.33836889266967773,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1897.0,
+      "completions/mean_length": 1158.421875,
+      "completions/mean_terminated_length": 953.1346435546875,
+      "completions/min_length": 503.0,
+      "completions/min_terminated_length": 503.0,
+      "epoch": 0.12,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29531243443489075,
+      "learning_rate": 5.97037808470444e-07,
+      "loss": -0.0,
+      "num_tokens": 11401213.0,
+      "reward": 1.0410652160644531,
+      "reward_std": 0.7858219742774963,
+      "rewards/cosine_scaled_reward/mean": 0.09084508568048477,
+      "rewards/cosine_scaled_reward/std": 0.5061684250831604,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1867.0,
+      "completions/mean_length": 1045.859375,
+      "completions/mean_terminated_length": 837.867919921875,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "epoch": 0.12114285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26259294152259827,
+      "learning_rate": 5.892200842364462e-07,
+      "loss": -0.0,
+      "num_tokens": 11478980.0,
+      "reward": 1.0545225143432617,
+      "reward_std": 0.7633667588233948,
+      "rewards/cosine_scaled_reward/mean": 0.07413630187511444,
+      "rewards/cosine_scaled_reward/std": 0.48842984437942505,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1999.0,
+      "completions/mean_length": 1101.234375,
+      "completions/mean_terminated_length": 946.30908203125,
+      "completions/min_length": 346.0,
+      "completions/min_terminated_length": 346.0,
+      "epoch": 0.12228571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3363504409790039,
+      "learning_rate": 5.813904131848564e-07,
+      "loss": 0.0,
+      "num_tokens": 11560611.0,
+      "reward": 0.648673415184021,
+      "reward_std": 0.6051540970802307,
+      "rewards/cosine_scaled_reward/mean": -0.11316327750682831,
+      "rewards/cosine_scaled_reward/std": 0.37149766087532043,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1857.0,
+      "completions/mean_length": 1225.28125,
+      "completions/mean_terminated_length": 1054.5283203125,
+      "completions/min_length": 515.0,
+      "completions/min_terminated_length": 515.0,
+      "epoch": 0.12342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2867675721645355,
+      "learning_rate": 5.735511803093248e-07,
+      "loss": 0.0,
+      "num_tokens": 11649389.0,
+      "reward": 0.560509204864502,
+      "reward_std": 0.6691359877586365,
+      "rewards/cosine_scaled_reward/mean": -0.14943289756774902,
+      "rewards/cosine_scaled_reward/std": 0.4461749494075775,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2001.0,
+      "completions/mean_length": 1227.203125,
+      "completions/mean_terminated_length": 1056.84912109375,
+      "completions/min_length": 513.0,
+      "completions/min_terminated_length": 513.0,
+      "epoch": 0.12457142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2772690951824188,
+      "learning_rate": 5.657047735161255e-07,
+      "loss": -0.0,
+      "num_tokens": 11739178.0,
+      "reward": 0.6980891227722168,
+      "reward_std": 0.624833345413208,
+      "rewards/cosine_scaled_reward/mean": -0.0650179386138916,
+      "rewards/cosine_scaled_reward/std": 0.41062912344932556,
+      "rewards/format_reward/mean": 0.828125,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1727.0,
+      "completions/mean_length": 1145.0,
+      "completions/mean_terminated_length": 914.8235473632812,
+      "completions/min_length": 414.0,
+      "completions/min_terminated_length": 414.0,
+      "epoch": 0.12571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3468596637248993,
+      "learning_rate": 5.578535828967777e-07,
+      "loss": -0.0,
+      "num_tokens": 11823234.0,
+      "reward": 0.6972323656082153,
+      "reward_std": 0.5477026104927063,
+      "rewards/cosine_scaled_reward/mean": -0.08888379484415054,
+      "rewards/cosine_scaled_reward/std": 0.3565239906311035,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1969.0,
+      "completions/max_terminated_length": 1969.0,
+      "completions/mean_length": 977.046875,
+      "completions/mean_terminated_length": 977.046875,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "epoch": 0.12685714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3180137574672699,
+      "learning_rate": 5.5e-07,
+      "loss": 0.0,
+      "num_tokens": 11895885.0,
+      "reward": 0.8744360208511353,
+      "reward_std": 0.5815237164497375,
+      "rewards/cosine_scaled_reward/mean": -0.06278196722269058,
+      "rewards/cosine_scaled_reward/std": 0.37791064381599426,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1954.0,
+      "completions/mean_length": 1269.421875,
+      "completions/mean_terminated_length": 1089.75,
+      "completions/min_length": 605.0,
+      "completions/min_terminated_length": 605.0,
+      "epoch": 0.128,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2817465364933014,
+      "learning_rate": 5.421464171032224e-07,
+      "loss": -0.0,
+      "num_tokens": 11988224.0,
+      "reward": 0.9151681065559387,
+      "reward_std": 0.594943642616272,
+      "rewards/cosine_scaled_reward/mean": 0.02789657562971115,
+      "rewards/cosine_scaled_reward/std": 0.4965399205684662,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1910.0,
+      "completions/max_terminated_length": 1910.0,
+      "completions/mean_length": 934.578125,
+      "completions/mean_terminated_length": 934.578125,
+      "completions/min_length": 326.0,
+      "completions/min_terminated_length": 326.0,
+      "epoch": 0.12914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3341560959815979,
+      "learning_rate": 5.342952264838747e-07,
+      "loss": -0.0,
+      "num_tokens": 12058333.0,
+      "reward": 1.0256879329681396,
+      "reward_std": 0.717230498790741,
+      "rewards/cosine_scaled_reward/mean": 0.02065650373697281,
+      "rewards/cosine_scaled_reward/std": 0.4963410794734955,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1911.0,
+      "completions/mean_length": 1055.21875,
+      "completions/mean_terminated_length": 971.0847778320312,
+      "completions/min_length": 124.0,
+      "completions/min_terminated_length": 124.0,
+      "epoch": 0.13028571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3800676763057709,
+      "learning_rate": 5.264488196906752e-07,
+      "loss": -0.0,
+      "num_tokens": 12135715.0,
+      "reward": 0.649993896484375,
+      "reward_std": 0.5865596532821655,
+      "rewards/cosine_scaled_reward/mean": -0.1750030517578125,
+      "rewards/cosine_scaled_reward/std": 0.3388007879257202,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1981.0,
+      "completions/mean_length": 1169.671875,
+      "completions/mean_terminated_length": 987.3773803710938,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "epoch": 0.13142857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3112519085407257,
+      "learning_rate": 5.186095868151436e-07,
+      "loss": 0.0,
+      "num_tokens": 12221790.0,
+      "reward": 0.7184536457061768,
+      "reward_std": 0.44992831349372864,
+      "rewards/cosine_scaled_reward/mean": -0.06264819949865341,
+      "rewards/cosine_scaled_reward/std": 0.44565486907958984,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1936.0,
+      "completions/mean_length": 1224.890625,
+      "completions/mean_terminated_length": 1072.4630126953125,
+      "completions/min_length": 393.0,
+      "completions/min_terminated_length": 393.0,
+      "epoch": 0.13257142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2884223461151123,
+      "learning_rate": 5.107799157635538e-07,
+      "loss": 0.0,
+      "num_tokens": 12311567.0,
+      "reward": 0.8372049927711487,
+      "reward_std": 0.608986496925354,
+      "rewards/cosine_scaled_reward/mean": -0.026710007339715958,
+      "rewards/cosine_scaled_reward/std": 0.4437602162361145,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1927.0,
+      "completions/mean_length": 1078.65625,
+      "completions/mean_terminated_length": 1030.9835205078125,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "epoch": 0.1337142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3016076385974884,
+      "learning_rate": 5.02962191529556e-07,
+      "loss": -0.0,
+      "num_tokens": 12391625.0,
+      "reward": 0.8182538747787476,
+      "reward_std": 0.6463132500648499,
+      "rewards/cosine_scaled_reward/mean": -0.09087307006120682,
+      "rewards/cosine_scaled_reward/std": 0.3895137310028076,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1946.0,
+      "completions/mean_length": 1226.046875,
+      "completions/mean_terminated_length": 952.0625,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "epoch": 0.13485714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2991194427013397,
+      "learning_rate": 4.951587954676837e-07,
+      "loss": 0.0,
+      "num_tokens": 12480628.0,
+      "reward": 0.6370267868041992,
+      "reward_std": 0.7525250911712646,
+      "rewards/cosine_scaled_reward/mean": -0.056486621499061584,
+      "rewards/cosine_scaled_reward/std": 0.44576171040534973,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4364357888698578,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1038.96875,
+      "completions/mean_terminated_length": 894.8214721679688,
+      "completions/min_length": 149.0,
+      "completions/min_terminated_length": 149.0,
+      "epoch": 0.136,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.4483291506767273,
+      "learning_rate": 4.873721045679706e-07,
+      "loss": 0.0,
+      "num_tokens": 12557530.0,
+      "reward": 0.9855979084968567,
+      "reward_std": 0.6055079698562622,
+      "rewards/cosine_scaled_reward/mean": 0.04748644679784775,
+      "rewards/cosine_scaled_reward/std": 0.47108832001686096,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1947.0,
+      "completions/mean_length": 856.578125,
+      "completions/mean_terminated_length": 818.1451416015625,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 0.13714285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3406151831150055,
+      "learning_rate": 4.79604490731896e-07,
+      "loss": -0.0,
+      "num_tokens": 12622807.0,
+      "reward": 0.7979192733764648,
+      "reward_std": 0.6180044412612915,
+      "rewards/cosine_scaled_reward/mean": -0.10104038566350937,
+      "rewards/cosine_scaled_reward/std": 0.44317325949668884,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1764.0,
+      "completions/mean_length": 726.34375,
+      "completions/mean_terminated_length": 683.7096557617188,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 0.1382857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.4178949296474457,
+      "learning_rate": 4.7185832004988133e-07,
+      "loss": 0.0,
+      "num_tokens": 12678989.0,
+      "reward": 1.161607265472412,
+      "reward_std": 0.6393733024597168,
+      "rewards/cosine_scaled_reward/mean": 0.08080361783504486,
+      "rewards/cosine_scaled_reward/std": 0.5313310027122498,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2003.0,
+      "completions/mean_length": 1133.796875,
+      "completions/mean_terminated_length": 1039.22412109375,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 0.13942857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3333284258842468,
+      "learning_rate": 4.641359520805548e-07,
+      "loss": 0.0,
+      "num_tokens": 12763112.0,
+      "reward": 0.9356573820114136,
+      "reward_std": 0.6247758269309998,
+      "rewards/cosine_scaled_reward/mean": -0.02435879409313202,
+      "rewards/cosine_scaled_reward/std": 0.4759780466556549,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1993.0,
+      "completions/mean_length": 1031.296875,
+      "completions/mean_terminated_length": 981.2950439453125,
+      "completions/min_length": 447.0,
+      "completions/min_terminated_length": 447.0,
+      "epoch": 0.14057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29939791560173035,
+      "learning_rate": 4.5643973913200837e-07,
+      "loss": -0.0,
+      "num_tokens": 12839347.0,
+      "reward": 0.7725162506103516,
+      "reward_std": 0.5560778379440308,
+      "rewards/cosine_scaled_reward/mean": -0.09811685979366302,
+      "rewards/cosine_scaled_reward/std": 0.3822804391384125,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2011.0,
+      "completions/mean_length": 979.234375,
+      "completions/mean_terminated_length": 944.758056640625,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "epoch": 0.1417142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34992095828056335,
+      "learning_rate": 4.4877202554526084e-07,
+      "loss": 0.0,
+      "num_tokens": 12912970.0,
+      "reward": 1.085427165031433,
+      "reward_std": 0.6837464570999146,
+      "rewards/cosine_scaled_reward/mean": 0.05052608996629715,
+      "rewards/cosine_scaled_reward/std": 0.4791998267173767,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1993.0,
+      "completions/mean_length": 1076.40625,
+      "completions/mean_terminated_length": 994.0678100585938,
+      "completions/min_length": 309.0,
+      "completions/min_terminated_length": 309.0,
+      "epoch": 0.14285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27060386538505554,
+      "learning_rate": 4.4113514698014953e-07,
+      "loss": -0.0,
+      "num_tokens": 12992788.0,
+      "reward": 1.0397578477859497,
+      "reward_std": 0.43823006749153137,
+      "rewards/cosine_scaled_reward/mean": 0.019878946244716644,
+      "rewards/cosine_scaled_reward/std": 0.46214956045150757,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1856.0,
+      "completions/mean_length": 1071.53125,
+      "completions/mean_terminated_length": 1006.4334106445312,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "epoch": 0.144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2776121497154236,
+      "learning_rate": 4.3353142970386557e-07,
+      "loss": 0.0,
+      "num_tokens": 13072662.0,
+      "reward": 1.0028693675994873,
+      "reward_std": 0.6879971027374268,
+      "rewards/cosine_scaled_reward/mean": 0.0014346465468406677,
+      "rewards/cosine_scaled_reward/std": 0.42488595843315125,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1868.0,
+      "completions/mean_length": 1180.484375,
+      "completions/mean_terminated_length": 1056.5535888671875,
+      "completions/min_length": 285.0,
+      "completions/min_terminated_length": 285.0,
+      "epoch": 0.14514285714285713,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2829054594039917,
+      "learning_rate": 4.2596318988235037e-07,
+      "loss": -0.0,
+      "num_tokens": 13159309.0,
+      "reward": 0.6576684713363647,
+      "reward_std": 0.66895592212677,
+      "rewards/cosine_scaled_reward/mean": -0.15554077923297882,
+      "rewards/cosine_scaled_reward/std": 0.3959099054336548,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1869.0,
+      "completions/mean_length": 1053.328125,
+      "completions/mean_terminated_length": 950.4310302734375,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "epoch": 0.1462857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29738253355026245,
+      "learning_rate": 4.1843273287476854e-07,
+      "loss": -0.0,
+      "num_tokens": 13237074.0,
+      "reward": 0.8851851224899292,
+      "reward_std": 0.7390589118003845,
+      "rewards/cosine_scaled_reward/mean": -0.041782446205616,
+      "rewards/cosine_scaled_reward/std": 0.46901625394821167,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1927.0,
+      "completions/mean_length": 1228.484375,
+      "completions/mean_terminated_length": 1111.4107666015625,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "epoch": 0.14742857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25943535566329956,
+      "learning_rate": 4.1094235253127374e-07,
+      "loss": -0.0,
+      "num_tokens": 13326401.0,
+      "reward": 0.9628820419311523,
+      "reward_std": 0.6490253210067749,
+      "rewards/cosine_scaled_reward/mean": 0.004878522828221321,
+      "rewards/cosine_scaled_reward/std": 0.45456331968307495,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1967.0,
+      "completions/mean_length": 1089.578125,
+      "completions/mean_terminated_length": 952.6607666015625,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "epoch": 0.14857142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3009719252586365,
+      "learning_rate": 4.034943304942796e-07,
+      "loss": 0.0,
+      "num_tokens": 13406638.0,
+      "reward": 0.5984547138214111,
+      "reward_std": 0.7008002996444702,
+      "rewards/cosine_scaled_reward/mean": -0.14608514308929443,
+      "rewards/cosine_scaled_reward/std": 0.37894922494888306,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1651.0,
+      "completions/mean_length": 1058.03125,
+      "completions/mean_terminated_length": 916.607177734375,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "epoch": 0.14971428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.306725412607193,
+      "learning_rate": 3.9609093550344907e-07,
+      "loss": 0.0,
+      "num_tokens": 13484088.0,
+      "reward": 1.0469268560409546,
+      "reward_std": 0.6023457050323486,
+      "rewards/cosine_scaled_reward/mean": 0.0703384131193161,
+      "rewards/cosine_scaled_reward/std": 0.47298464179039,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1773.0,
+      "completions/mean_length": 1342.78125,
+      "completions/mean_terminated_length": 919.6500244140625,
+      "completions/min_length": 366.0,
+      "completions/min_terminated_length": 366.0,
+      "epoch": 0.15085714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3032574951648712,
+      "learning_rate": 3.8873442270461485e-07,
+      "loss": -0.0,
+      "num_tokens": 13581090.0,
+      "reward": 0.4643245339393616,
+      "reward_std": 0.7533800601959229,
+      "rewards/cosine_scaled_reward/mean": -0.06471271812915802,
+      "rewards/cosine_scaled_reward/std": 0.4610835611820221,
+      "rewards/format_reward/mean": 0.59375,
+      "rewards/format_reward/std": 0.49501484632492065,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1974.0,
+      "completions/mean_length": 1144.921875,
+      "completions/mean_terminated_length": 957.4906005859375,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "epoch": 0.152,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32285141944885254,
+      "learning_rate": 3.8142703296283953e-07,
+      "loss": 0.0,
+      "num_tokens": 13665589.0,
+      "reward": 0.5014957189559937,
+      "reward_std": 0.5352932214736938,
+      "rewards/cosine_scaled_reward/mean": -0.17112717032432556,
+      "rewards/cosine_scaled_reward/std": 0.28127768635749817,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1965.0,
+      "completions/mean_length": 975.53125,
+      "completions/mean_terminated_length": 958.5079956054688,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 0.15314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.40716752409935,
+      "learning_rate": 3.7417099217982686e-07,
+      "loss": -0.0,
+      "num_tokens": 13738591.0,
+      "reward": 1.1759617328643799,
+      "reward_std": 0.4804629683494568,
+      "rewards/cosine_scaled_reward/mean": 0.08798093348741531,
+      "rewards/cosine_scaled_reward/std": 0.5343761444091797,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1686.0,
+      "completions/max_terminated_length": 1686.0,
+      "completions/mean_length": 758.515625,
+      "completions/mean_terminated_length": 758.515625,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.15428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.42696353793144226,
+      "learning_rate": 3.6696851061588994e-07,
+      "loss": -0.0,
+      "num_tokens": 13797608.0,
+      "reward": 1.3851683139801025,
+      "reward_std": 0.5234883427619934,
+      "rewards/cosine_scaled_reward/mean": 0.19258417189121246,
+      "rewards/cosine_scaled_reward/std": 0.49346473813056946,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2007.0,
+      "completions/mean_length": 1169.875,
+      "completions/mean_terminated_length": 1095.4576416015625,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "epoch": 0.15542857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28027620911598206,
+      "learning_rate": 3.5982178221668533e-07,
+      "loss": -0.0,
+      "num_tokens": 13883152.0,
+      "reward": 1.0174503326416016,
+      "reward_std": 0.5889347791671753,
+      "rewards/cosine_scaled_reward/mean": 0.016537662595510483,
+      "rewards/cosine_scaled_reward/std": 0.4763922095298767,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1814.0,
+      "completions/mean_length": 1105.3125,
+      "completions/mean_terminated_length": 1042.4666748046875,
+      "completions/min_length": 446.0,
+      "completions/min_terminated_length": 446.0,
+      "epoch": 0.15657142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3002299666404724,
+      "learning_rate": 3.5273298394491515e-07,
+      "loss": 0.0,
+      "num_tokens": 13964500.0,
+      "reward": 0.841381847858429,
+      "reward_std": 0.6354345083236694,
+      "rewards/cosine_scaled_reward/mean": -0.07149658352136612,
+      "rewards/cosine_scaled_reward/std": 0.4138363003730774,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1983.0,
+      "completions/mean_length": 1125.484375,
+      "completions/mean_terminated_length": 974.5272216796875,
+      "completions/min_length": 361.0,
+      "completions/min_terminated_length": 361.0,
+      "epoch": 0.15771428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28766506910324097,
+      "learning_rate": 3.45704275117204e-07,
+      "loss": -0.0,
+      "num_tokens": 14047843.0,
+      "reward": 0.8758631944656372,
+      "reward_std": 0.7212573289871216,
+      "rewards/cosine_scaled_reward/mean": -0.05425591766834259,
+      "rewards/cosine_scaled_reward/std": 0.4783853590488434,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2023.0,
+      "completions/mean_length": 1216.171875,
+      "completions/mean_terminated_length": 1160.7166748046875,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "epoch": 0.15885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2882857024669647,
+      "learning_rate": 3.387377967463493e-07,
+      "loss": -0.0,
+      "num_tokens": 14136318.0,
+      "reward": 0.7189284563064575,
+      "reward_std": 0.4593912959098816,
+      "rewards/cosine_scaled_reward/mean": -0.13272328674793243,
+      "rewards/cosine_scaled_reward/std": 0.33584704995155334,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2017.0,
+      "completions/mean_length": 1142.140625,
+      "completions/mean_terminated_length": 1012.732177734375,
+      "completions/min_length": 389.0,
+      "completions/min_terminated_length": 389.0,
+      "epoch": 0.16,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.3000667095184326,
+      "learning_rate": 3.3183567088914833e-07,
+      "loss": 0.0,
+      "num_tokens": 14219639.0,
+      "reward": 0.8278639316558838,
+      "reward_std": 0.46724599599838257,
+      "rewards/cosine_scaled_reward/mean": -0.03919300064444542,
+      "rewards/cosine_scaled_reward/std": 0.4650508463382721,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1919.0,
+      "completions/mean_length": 1025.421875,
+      "completions/mean_terminated_length": 975.131103515625,
+      "completions/min_length": 394.0,
+      "completions/min_terminated_length": 394.0,
+      "epoch": 0.16114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3207882046699524,
+      "learning_rate": 3.250000000000001e-07,
+      "loss": 0.0,
+      "num_tokens": 14295826.0,
+      "reward": 0.8871637582778931,
+      "reward_std": 0.6538586616516113,
+      "rewards/cosine_scaled_reward/mean": -0.04079316183924675,
+      "rewards/cosine_scaled_reward/std": 0.43451616168022156,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1984.0,
+      "completions/mean_length": 1233.90625,
+      "completions/mean_terminated_length": 1149.689697265625,
+      "completions/min_length": 278.0,
+      "completions/min_terminated_length": 278.0,
+      "epoch": 0.16228571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3009903132915497,
+      "learning_rate": 3.182328662904756e-07,
+      "loss": 0.0,
+      "num_tokens": 14385300.0,
+      "reward": 0.8573208451271057,
+      "reward_std": 0.6099269390106201,
+      "rewards/cosine_scaled_reward/mean": -0.055714573711156845,
+      "rewards/cosine_scaled_reward/std": 0.43728360533714294,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1946.0,
+      "completions/mean_length": 1136.078125,
+      "completions/mean_terminated_length": 1005.8035888671875,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "epoch": 0.16342857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31794917583465576,
+      "learning_rate": 3.115363310950578e-07,
+      "loss": 0.0,
+      "num_tokens": 14468825.0,
+      "reward": 0.6553314924240112,
+      "reward_std": 0.6344339847564697,
+      "rewards/cosine_scaled_reward/mean": -0.11764675378799438,
+      "rewards/cosine_scaled_reward/std": 0.3099633455276489,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1947.0,
+      "completions/mean_length": 1220.6875,
+      "completions/mean_terminated_length": 1029.769287109375,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 0.16457142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3814108967781067,
+      "learning_rate": 3.0491243424323783e-07,
+      "loss": 0.0,
+      "num_tokens": 14558437.0,
+      "reward": 0.7285318970680237,
+      "reward_std": 0.8925961256027222,
+      "rewards/cosine_scaled_reward/mean": -0.05760904401540756,
+      "rewards/cosine_scaled_reward/std": 0.492266446352005,
+      "rewards/format_reward/mean": 0.84375,
+      "rewards/format_reward/std": 0.36596253514289856,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1880.0,
+      "completions/mean_length": 969.796875,
+      "completions/mean_terminated_length": 916.7704467773438,
+      "completions/min_length": 275.0,
+      "completions/min_terminated_length": 275.0,
+      "epoch": 0.1657142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3201180398464203,
+      "learning_rate": 2.9836319343816397e-07,
+      "loss": -0.0,
+      "num_tokens": 14630448.0,
+      "reward": 0.8149441480636597,
+      "reward_std": 0.5824600458145142,
+      "rewards/cosine_scaled_reward/mean": -0.08471541851758957,
+      "rewards/cosine_scaled_reward/std": 0.475755512714386,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1943.0,
+      "completions/mean_length": 1034.484375,
+      "completions/mean_terminated_length": 966.9166870117188,
+      "completions/min_length": 482.0,
+      "completions/min_terminated_length": 482.0,
+      "epoch": 0.16685714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28184273838996887,
+      "learning_rate": 2.918906036420294e-07,
+      "loss": -0.0,
+      "num_tokens": 14707271.0,
+      "reward": 0.8387603759765625,
+      "reward_std": 0.5346506237983704,
+      "rewards/cosine_scaled_reward/mean": -0.07280732691287994,
+      "rewards/cosine_scaled_reward/std": 0.43024110794067383,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1980.0,
+      "completions/mean_length": 1249.984375,
+      "completions/mean_terminated_length": 1046.568603515625,
+      "completions/min_length": 550.0,
+      "completions/min_terminated_length": 550.0,
+      "epoch": 0.168,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32145801186561584,
+      "learning_rate": 2.854966364683872e-07,
+      "loss": 0.0,
+      "num_tokens": 14798054.0,
+      "reward": 0.7505484819412231,
+      "reward_std": 0.5473448634147644,
+      "rewards/cosine_scaled_reward/mean": -0.07003828883171082,
+      "rewards/cosine_scaled_reward/std": 0.4046306014060974,
+      "rewards/format_reward/mean": 0.890625,
+      "rewards/format_reward/std": 0.3145764470100403,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1844.0,
+      "completions/mean_length": 1062.828125,
+      "completions/mean_terminated_length": 960.913818359375,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "epoch": 0.16914285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2667451500892639,
+      "learning_rate": 2.791832395815782e-07,
+      "loss": -0.0,
+      "num_tokens": 14877259.0,
+      "reward": 0.7823130488395691,
+      "reward_std": 0.48230016231536865,
+      "rewards/cosine_scaled_reward/mean": -0.06978099048137665,
+      "rewards/cosine_scaled_reward/std": 0.37567150592803955,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1386.875,
+      "completions/mean_terminated_length": 1086.3636474609375,
+      "completions/min_length": 439.0,
+      "completions/min_terminated_length": 439.0,
+      "epoch": 0.1702857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2730913758277893,
+      "learning_rate": 2.729523361034538e-07,
+      "loss": 0.0,
+      "num_tokens": 14977915.0,
+      "reward": 0.48214927315711975,
+      "reward_std": 0.8376681804656982,
+      "rewards/cosine_scaled_reward/mean": -0.14173786342144012,
+      "rewards/cosine_scaled_reward/std": 0.4272434711456299,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1831.0,
+      "completions/mean_length": 994.15625,
+      "completions/mean_terminated_length": 942.3278198242188,
+      "completions/min_length": 322.0,
+      "completions/min_terminated_length": 322.0,
+      "epoch": 0.17142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2946690022945404,
+      "learning_rate": 2.6680582402757324e-07,
+      "loss": -0.0,
+      "num_tokens": 15052045.0,
+      "reward": 0.8893749713897705,
+      "reward_std": 0.7130615711212158,
+      "rewards/cosine_scaled_reward/mean": -0.05531252920627594,
+      "rewards/cosine_scaled_reward/std": 0.4389563202857971,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1975.0,
+      "completions/mean_length": 1094.4375,
+      "completions/mean_terminated_length": 917.8518676757812,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 0.17257142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29988256096839905,
+      "learning_rate": 2.6074557564105724e-07,
+      "loss": 0.0,
+      "num_tokens": 15132769.0,
+      "reward": 1.088501214981079,
+      "reward_std": 0.9213382005691528,
+      "rewards/cosine_scaled_reward/mean": 0.10675054788589478,
+      "rewards/cosine_scaled_reward/std": 0.510394811630249,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1976.0,
+      "completions/mean_length": 1024.203125,
+      "completions/mean_terminated_length": 937.440673828125,
+      "completions/min_length": 140.0,
+      "completions/min_terminated_length": 140.0,
+      "epoch": 0.1737142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.46614158153533936,
+      "learning_rate": 2.547734369542718e-07,
+      "loss": -0.0,
+      "num_tokens": 15208982.0,
+      "reward": 0.7280048131942749,
+      "reward_std": 0.706195592880249,
+      "rewards/cosine_scaled_reward/mean": -0.10474759340286255,
+      "rewards/cosine_scaled_reward/std": 0.45987388491630554,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1955.0,
+      "completions/mean_length": 1180.234375,
+      "completions/mean_terminated_length": 1056.2679443359375,
+      "completions/min_length": 298.0,
+      "completions/min_terminated_length": 298.0,
+      "epoch": 0.17485714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33488133549690247,
+      "learning_rate": 2.488912271385139e-07,
+      "loss": -0.0,
+      "num_tokens": 15295661.0,
+      "reward": 0.4985957443714142,
+      "reward_std": 0.4677598178386688,
+      "rewards/cosine_scaled_reward/mean": -0.2272646427154541,
+      "rewards/cosine_scaled_reward/std": 0.2307518571615219,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2019.0,
+      "completions/mean_length": 1340.296875,
+      "completions/mean_terminated_length": 1142.1400146484375,
+      "completions/min_length": 559.0,
+      "completions/min_terminated_length": 559.0,
+      "epoch": 0.176,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25304633378982544,
+      "learning_rate": 2.4310073797187573e-07,
+      "loss": -0.0,
+      "num_tokens": 15392504.0,
+      "reward": 0.7636169195175171,
+      "reward_std": 0.7114115953445435,
+      "rewards/cosine_scaled_reward/mean": -0.03225403279066086,
+      "rewards/cosine_scaled_reward/std": 0.42686402797698975,
+      "rewards/format_reward/mean": 0.828125,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1909.0,
+      "completions/mean_length": 915.6875,
+      "completions/mean_terminated_length": 798.5516967773438,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "epoch": 0.17714285714285713,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.35448068380355835,
+      "learning_rate": 2.374037332934512e-07,
+      "loss": 0.0,
+      "num_tokens": 15461732.0,
+      "reward": 0.736025333404541,
+      "reward_std": 0.5466883182525635,
+      "rewards/cosine_scaled_reward/mean": -0.11636234819889069,
+      "rewards/cosine_scaled_reward/std": 0.43356192111968994,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1913.0,
+      "completions/mean_length": 1157.90625,
+      "completions/mean_terminated_length": 952.5000610351562,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "epoch": 0.1782857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.4357910454273224,
+      "learning_rate": 2.3180194846605364e-07,
+      "loss": -0.0,
+      "num_tokens": 15545942.0,
+      "reward": 0.8330824971199036,
+      "reward_std": 0.725536048412323,
+      "rewards/cosine_scaled_reward/mean": -0.02095877379179001,
+      "rewards/cosine_scaled_reward/std": 0.4767586290836334,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2028.0,
+      "completions/mean_length": 1157.75,
+      "completions/mean_terminated_length": 1030.571533203125,
+      "completions/min_length": 485.0,
+      "completions/min_terminated_length": 485.0,
+      "epoch": 0.17942857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29891225695610046,
+      "learning_rate": 2.2629708984760706e-07,
+      "loss": 0.0,
+      "num_tokens": 15629998.0,
+      "reward": 0.6674755811691284,
+      "reward_std": 0.6577311754226685,
+      "rewards/cosine_scaled_reward/mean": -0.13501222431659698,
+      "rewards/cosine_scaled_reward/std": 0.36102381348609924,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1856.0,
+      "completions/mean_length": 1013.6875,
+      "completions/mean_terminated_length": 962.8196411132812,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "epoch": 0.18057142857142858,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.2723560333251953,
+      "learning_rate": 2.2089083427137329e-07,
+      "loss": 0.0,
+      "num_tokens": 15704994.0,
+      "reward": 0.9709224104881287,
+      "reward_std": 0.48810505867004395,
+      "rewards/cosine_scaled_reward/mean": -0.014538809657096863,
+      "rewards/cosine_scaled_reward/std": 0.4970093369483948,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1881.0,
+      "completions/mean_length": 1081.296875,
+      "completions/mean_terminated_length": 962.5789794921875,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "epoch": 0.18171428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2894439697265625,
+      "learning_rate": 2.1558482853517253e-07,
+      "loss": -0.0,
+      "num_tokens": 15785877.0,
+      "reward": 0.5938807725906372,
+      "reward_std": 0.592242956161499,
+      "rewards/cosine_scaled_reward/mean": -0.16399714350700378,
+      "rewards/cosine_scaled_reward/std": 0.3423241078853607,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1750.0,
+      "completions/mean_length": 968.25,
+      "completions/mean_terminated_length": 915.1474609375,
+      "completions/min_length": 417.0,
+      "completions/min_terminated_length": 417.0,
+      "epoch": 0.18285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3261898159980774,
+      "learning_rate": 2.1038068889975259e-07,
+      "loss": 0.0,
+      "num_tokens": 15859429.0,
+      "reward": 1.2050117254257202,
+      "reward_std": 0.6944217681884766,
+      "rewards/cosine_scaled_reward/mean": 0.10250584781169891,
+      "rewards/cosine_scaled_reward/std": 0.5283173322677612,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2026.0,
+      "completions/mean_length": 1031.75,
+      "completions/mean_terminated_length": 945.6271362304688,
+      "completions/min_length": 414.0,
+      "completions/min_terminated_length": 414.0,
+      "epoch": 0.184,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34274861216545105,
+      "learning_rate": 2.0528000059645995e-07,
+      "loss": -0.0,
+      "num_tokens": 15935453.0,
+      "reward": 0.9563960433006287,
+      "reward_std": 0.6316370964050293,
+      "rewards/cosine_scaled_reward/mean": 0.009448029100894928,
+      "rewards/cosine_scaled_reward/std": 0.46292582154273987,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.234375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1891.0,
+      "completions/mean_length": 1167.828125,
+      "completions/mean_terminated_length": 898.3877563476562,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "epoch": 0.18514285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3887297511100769,
+      "learning_rate": 2.0028431734436308e-07,
+      "loss": 0.0,
+      "num_tokens": 16020498.0,
+      "reward": 0.6932262182235718,
+      "reward_std": 0.8278101682662964,
+      "rewards/cosine_scaled_reward/mean": -0.08307439833879471,
+      "rewards/cosine_scaled_reward/std": 0.3847581744194031,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1820.0,
+      "completions/mean_length": 1058.84375,
+      "completions/mean_terminated_length": 956.5172119140625,
+      "completions/min_length": 394.0,
+      "completions/min_terminated_length": 394.0,
+      "epoch": 0.18628571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30917680263519287,
+      "learning_rate": 1.9539516087697517e-07,
+      "loss": 0.0,
+      "num_tokens": 16099448.0,
+      "reward": 1.3529155254364014,
+      "reward_std": 0.8906396627426147,
+      "rewards/cosine_scaled_reward/mean": 0.22333277761936188,
+      "rewards/cosine_scaled_reward/std": 0.5322388410568237,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1974.0,
+      "completions/mean_length": 988.703125,
+      "completions/mean_terminated_length": 918.0833740234375,
+      "completions/min_length": 317.0,
+      "completions/min_terminated_length": 317.0,
+      "epoch": 0.18742857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33646658062934875,
+      "learning_rate": 1.9061402047871833e-07,
+      "loss": 0.0,
+      "num_tokens": 16173253.0,
+      "reward": 1.046778678894043,
+      "reward_std": 0.6892427206039429,
+      "rewards/cosine_scaled_reward/mean": 0.0390143096446991,
+      "rewards/cosine_scaled_reward/std": 0.4476637840270996,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2033.0,
+      "completions/mean_length": 1103.5,
+      "completions/mean_terminated_length": 948.9454345703125,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "epoch": 0.18857142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.338925838470459,
+      "learning_rate": 1.8594235253127372e-07,
+      "loss": -0.0,
+      "num_tokens": 16255293.0,
+      "reward": 0.7887892723083496,
+      "reward_std": 0.6329070329666138,
+      "rewards/cosine_scaled_reward/mean": -0.0665428563952446,
+      "rewards/cosine_scaled_reward/std": 0.4880979061126709,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1714.0,
+      "completions/mean_length": 1166.265625,
+      "completions/mean_terminated_length": 1002.9815063476562,
+      "completions/min_length": 500.0,
+      "completions/min_terminated_length": 500.0,
+      "epoch": 0.18971428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29118841886520386,
+      "learning_rate": 1.8138158006995363e-07,
+      "loss": -0.0,
+      "num_tokens": 16341510.0,
+      "reward": 0.5021259784698486,
+      "reward_std": 0.5949545502662659,
+      "rewards/cosine_scaled_reward/mean": -0.18643701076507568,
+      "rewards/cosine_scaled_reward/std": 0.3388413190841675,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1756.0,
+      "completions/mean_length": 1027.96875,
+      "completions/mean_terminated_length": 922.4482421875,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "epoch": 0.19085714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3123703598976135,
+      "learning_rate": 1.7693309235023127e-07,
+      "loss": -0.0,
+      "num_tokens": 16418844.0,
+      "reward": 0.6054480671882629,
+      "reward_std": 0.6668864488601685,
+      "rewards/cosine_scaled_reward/mean": -0.17383846640586853,
+      "rewards/cosine_scaled_reward/std": 0.34976449608802795,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2033.0,
+      "completions/mean_length": 1131.890625,
+      "completions/mean_terminated_length": 1086.8360595703125,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 0.192,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2515013515949249,
+      "learning_rate": 1.7259824442455923e-07,
+      "loss": 0.0,
+      "num_tokens": 16502125.0,
+      "reward": 0.929424524307251,
+      "reward_std": 0.6242066621780396,
+      "rewards/cosine_scaled_reward/mean": -0.011850237846374512,
+      "rewards/cosine_scaled_reward/std": 0.4718935191631317,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1737.0,
+      "completions/mean_length": 908.53125,
+      "completions/mean_terminated_length": 871.774169921875,
+      "completions/min_length": 428.0,
+      "completions/min_terminated_length": 428.0,
+      "epoch": 0.19314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29841023683547974,
+      "learning_rate": 1.6837835672960831e-07,
+      "loss": -0.0,
+      "num_tokens": 16570895.0,
+      "reward": 1.6184587478637695,
+      "reward_std": 0.5710533857345581,
+      "rewards/cosine_scaled_reward/mean": 0.3092293441295624,
+      "rewards/cosine_scaled_reward/std": 0.5226604342460632,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2031.0,
+      "completions/mean_length": 1005.109375,
+      "completions/mean_terminated_length": 834.4545288085938,
+      "completions/min_length": 264.0,
+      "completions/min_terminated_length": 264.0,
+      "epoch": 0.19428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3179849088191986,
+      "learning_rate": 1.6427471468404952e-07,
+      "loss": -0.0,
+      "num_tokens": 16645006.0,
+      "reward": 1.0071099996566772,
+      "reward_std": 0.3746073246002197,
+      "rewards/cosine_scaled_reward/mean": 0.06605499982833862,
+      "rewards/cosine_scaled_reward/std": 0.4378518760204315,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1891.0,
+      "completions/mean_length": 1234.65625,
+      "completions/mean_terminated_length": 940.4680786132812,
+      "completions/min_length": 500.0,
+      "completions/min_terminated_length": 500.0,
+      "epoch": 0.19542857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2992324233055115,
+      "learning_rate": 1.6028856829700258e-07,
+      "loss": -0.0,
+      "num_tokens": 16734416.0,
+      "reward": 0.7108581066131592,
+      "reward_std": 0.7254206538200378,
+      "rewards/cosine_scaled_reward/mean": -0.02738344669342041,
+      "rewards/cosine_scaled_reward/std": 0.44080549478530884,
+      "rewards/format_reward/mean": 0.765625,
+      "rewards/format_reward/std": 0.42695629596710205,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1638.0,
+      "completions/mean_length": 900.234375,
+      "completions/mean_terminated_length": 823.7167358398438,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 0.19657142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.47149336338043213,
+      "learning_rate": 1.5642113178727193e-07,
+      "loss": 0.0,
+      "num_tokens": 16802647.0,
+      "reward": 1.3995718955993652,
+      "reward_std": 0.5902794599533081,
+      "rewards/cosine_scaled_reward/mean": 0.2310360074043274,
+      "rewards/cosine_scaled_reward/std": 0.5026565194129944,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2041.0,
+      "completions/mean_length": 925.078125,
+      "completions/mean_terminated_length": 787.1754150390625,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 0.1977142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3893924057483673,
+      "learning_rate": 1.5267358321348285e-07,
+      "loss": 0.0,
+      "num_tokens": 16873164.0,
+      "reward": 0.6720038056373596,
+      "reward_std": 0.667186975479126,
+      "rewards/cosine_scaled_reward/mean": -0.12493559718132019,
+      "rewards/cosine_scaled_reward/std": 0.40216636657714844,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2040.0,
+      "completions/mean_length": 1005.578125,
+      "completions/mean_terminated_length": 971.9515991210938,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 0.19885714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.39529484510421753,
+      "learning_rate": 1.4904706411523448e-07,
+      "loss": -0.0,
+      "num_tokens": 16947857.0,
+      "reward": 0.9172019958496094,
+      "reward_std": 0.6198633313179016,
+      "rewards/cosine_scaled_reward/mean": -0.03358650952577591,
+      "rewards/cosine_scaled_reward/std": 0.4403606951236725,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1828.0,
+      "completions/mean_length": 952.296875,
+      "completions/mean_terminated_length": 898.4097900390625,
+      "completions/min_length": 321.0,
+      "completions/min_terminated_length": 321.0,
+      "epoch": 0.2,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.322712779045105,
+      "learning_rate": 1.4554267916537495e-07,
+      "loss": 0.0,
+      "num_tokens": 17019628.0,
+      "reward": 0.871549129486084,
+      "reward_std": 0.46009254455566406,
+      "rewards/cosine_scaled_reward/mean": -0.05641293525695801,
+      "rewards/cosine_scaled_reward/std": 0.44415631890296936,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1975.0,
+      "completions/mean_length": 1048.453125,
+      "completions/mean_terminated_length": 945.0516967773438,
+      "completions/min_length": 287.0,
+      "completions/min_terminated_length": 287.0,
+      "epoch": 0.20114285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3287680447101593,
+      "learning_rate": 1.4216149583350755e-07,
+      "loss": -0.0,
+      "num_tokens": 17097897.0,
+      "reward": 0.839117705821991,
+      "reward_std": 0.7753168344497681,
+      "rewards/cosine_scaled_reward/mean": -0.04137861356139183,
+      "rewards/cosine_scaled_reward/std": 0.43453913927078247,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1915.0,
+      "completions/mean_length": 968.34375,
+      "completions/mean_terminated_length": 933.51611328125,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "epoch": 0.2022857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3266870677471161,
+      "learning_rate": 1.3890454406082956e-07,
+      "loss": -0.0,
+      "num_tokens": 17170095.0,
+      "reward": 1.0329997539520264,
+      "reward_std": 0.7290528416633606,
+      "rewards/cosine_scaled_reward/mean": 0.024312350898981094,
+      "rewards/cosine_scaled_reward/std": 0.46764034032821655,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2028.0,
+      "completions/mean_length": 1016.0625,
+      "completions/mean_terminated_length": 909.3103637695312,
+      "completions/min_length": 414.0,
+      "completions/min_terminated_length": 414.0,
+      "epoch": 0.20342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.330020546913147,
+      "learning_rate": 1.3577281594640182e-07,
+      "loss": -0.0,
+      "num_tokens": 17246659.0,
+      "reward": 1.1118203401565552,
+      "reward_std": 0.7913287878036499,
+      "rewards/cosine_scaled_reward/mean": 0.07934767752885818,
+      "rewards/cosine_scaled_reward/std": 0.5148099660873413,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.234375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1945.0,
+      "completions/mean_length": 1227.78125,
+      "completions/mean_terminated_length": 976.69384765625,
+      "completions/min_length": 270.0,
+      "completions/min_terminated_length": 270.0,
+      "epoch": 0.20457142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33415722846984863,
+      "learning_rate": 1.3276726544494571e-07,
+      "loss": 0.0,
+      "num_tokens": 17336069.0,
+      "reward": 0.608305037021637,
+      "reward_std": 0.5569274425506592,
+      "rewards/cosine_scaled_reward/mean": -0.10991000384092331,
+      "rewards/cosine_scaled_reward/std": 0.3418741822242737,
+      "rewards/format_reward/mean": 0.828125,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1932.0,
+      "completions/mean_length": 1024.46875,
+      "completions/mean_terminated_length": 956.2333984375,
+      "completions/min_length": 264.0,
+      "completions/min_terminated_length": 264.0,
+      "epoch": 0.2057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3402194082736969,
+      "learning_rate": 1.2988880807625927e-07,
+      "loss": -0.0,
+      "num_tokens": 17412811.0,
+      "reward": 1.6137604713439941,
+      "reward_std": 0.8008866310119629,
+      "rewards/cosine_scaled_reward/mean": 0.31469273567199707,
+      "rewards/cosine_scaled_reward/std": 0.5089212656021118,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2001.0,
+      "completions/mean_length": 1157.546875,
+      "completions/mean_terminated_length": 992.6481323242188,
+      "completions/min_length": 387.0,
+      "completions/min_terminated_length": 387.0,
+      "epoch": 0.20685714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29356250166893005,
+      "learning_rate": 1.2713832064634125e-07,
+      "loss": -0.0,
+      "num_tokens": 17498366.0,
+      "reward": 0.7507010698318481,
+      "reward_std": 0.5088521242141724,
+      "rewards/cosine_scaled_reward/mean": -0.07777446508407593,
+      "rewards/cosine_scaled_reward/std": 0.4100310504436493,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.234375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1840.0,
+      "completions/mean_length": 1166.390625,
+      "completions/mean_terminated_length": 896.5101928710938,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 0.208,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2653217613697052,
+      "learning_rate": 1.2451664098030743e-07,
+      "loss": -0.0,
+      "num_tokens": 17582807.0,
+      "reward": 0.7447050213813782,
+      "reward_std": 0.8267481327056885,
+      "rewards/cosine_scaled_reward/mean": -0.04170997440814972,
+      "rewards/cosine_scaled_reward/std": 0.4390917420387268,
+      "rewards/format_reward/mean": 0.828125,
+      "rewards/format_reward/std": 0.38025420904159546,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1843.0,
+      "completions/mean_length": 1003.203125,
+      "completions/mean_terminated_length": 933.550048828125,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "epoch": 0.20914285714285713,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3268946707248688,
+      "learning_rate": 1.220245676671809e-07,
+      "loss": 0.0,
+      "num_tokens": 17657628.0,
+      "reward": 1.0635898113250732,
+      "reward_std": 0.5967966914176941,
+      "rewards/cosine_scaled_reward/mean": 0.039607420563697815,
+      "rewards/cosine_scaled_reward/std": 0.43730178475379944,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1941.0,
+      "completions/mean_length": 1025.171875,
+      "completions/mean_terminated_length": 938.4915161132812,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.2102857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.36050307750701904,
+      "learning_rate": 1.1966285981663407e-07,
+      "loss": 0.0,
+      "num_tokens": 17734591.0,
+      "reward": 0.6448719501495361,
+      "reward_std": 0.503462553024292,
+      "rewards/cosine_scaled_reward/mean": -0.14631402492523193,
+      "rewards/cosine_scaled_reward/std": 0.3733954429626465,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1779.0,
+      "completions/mean_length": 969.015625,
+      "completions/mean_terminated_length": 934.2096557617188,
+      "completions/min_length": 107.0,
+      "completions/min_terminated_length": 107.0,
+      "epoch": 0.21142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.42219310998916626,
+      "learning_rate": 1.1743223682775649e-07,
+      "loss": -0.0,
+      "num_tokens": 17806792.0,
+      "reward": 0.7470877766609192,
+      "reward_std": 0.5973426103591919,
+      "rewards/cosine_scaled_reward/mean": -0.11864358186721802,
+      "rewards/cosine_scaled_reward/std": 0.41184645891189575,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1667.0,
+      "completions/mean_length": 1076.984375,
+      "completions/mean_terminated_length": 938.2678833007812,
+      "completions/min_length": 404.0,
+      "completions/min_terminated_length": 404.0,
+      "epoch": 0.21257142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30614498257637024,
+      "learning_rate": 1.1533337816991931e-07,
+      "loss": -0.0,
+      "num_tokens": 17886415.0,
+      "reward": 0.804481029510498,
+      "reward_std": 0.4629480838775635,
+      "rewards/cosine_scaled_reward/mean": -0.03525950014591217,
+      "rewards/cosine_scaled_reward/std": 0.45060867071151733,
+      "rewards/format_reward/mean": 0.875,
+      "rewards/format_reward/std": 0.3333333432674408,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1879.0,
+      "completions/mean_length": 1112.1875,
+      "completions/mean_terminated_length": 1049.800048828125,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 0.21371428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.4846937656402588,
+      "learning_rate": 1.1336692317580158e-07,
+      "loss": 0.0,
+      "num_tokens": 17968019.0,
+      "reward": 0.6981200575828552,
+      "reward_std": 0.53022301197052,
+      "rewards/cosine_scaled_reward/mean": -0.1275024712085724,
+      "rewards/cosine_scaled_reward/std": 0.38560083508491516,
+      "rewards/format_reward/mean": 0.953125,
+      "rewards/format_reward/std": 0.21304203569889069,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1910.0,
+      "completions/mean_length": 1079.90625,
+      "completions/mean_terminated_length": 997.8643798828125,
+      "completions/min_length": 295.0,
+      "completions/min_terminated_length": 295.0,
+      "epoch": 0.21485714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34140780568122864,
+      "learning_rate": 1.1153347084664419e-07,
+      "loss": -0.0,
+      "num_tokens": 18048933.0,
+      "reward": 0.5326423645019531,
+      "reward_std": 0.5487440824508667,
+      "rewards/cosine_scaled_reward/mean": -0.22586631774902344,
+      "rewards/cosine_scaled_reward/std": 0.3085760772228241,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2002.0,
+      "completions/mean_length": 868.546875,
+      "completions/mean_terminated_length": 830.5,
+      "completions/min_length": 37.0,
+      "completions/min_terminated_length": 37.0,
+      "epoch": 0.216,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.6607878804206848,
+      "learning_rate": 1.0983357966978745e-07,
+      "loss": -0.0,
+      "num_tokens": 18113808.0,
+      "reward": 0.7490335702896118,
+      "reward_std": 0.6654466390609741,
+      "rewards/cosine_scaled_reward/mean": -0.11767073720693588,
+      "rewards/cosine_scaled_reward/std": 0.4015049338340759,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1713.0,
+      "completions/mean_length": 938.8125,
+      "completions/mean_terminated_length": 903.0322265625,
+      "completions/min_length": 359.0,
+      "completions/min_terminated_length": 359.0,
+      "epoch": 0.21714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3069080710411072,
+      "learning_rate": 1.0826776744855121e-07,
+      "loss": -0.0,
+      "num_tokens": 18183660.0,
+      "reward": 0.9838922023773193,
+      "reward_std": 0.5085676908493042,
+      "rewards/cosine_scaled_reward/mean": -0.00024138391017913818,
+      "rewards/cosine_scaled_reward/std": 0.44459760189056396,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1851.0,
+      "completions/max_terminated_length": 1851.0,
+      "completions/mean_length": 902.453125,
+      "completions/mean_terminated_length": 902.453125,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 0.21828571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.35081905126571655,
+      "learning_rate": 1.068365111445064e-07,
+      "loss": 0.0,
+      "num_tokens": 18251705.0,
+      "reward": 1.247175931930542,
+      "reward_std": 0.8716963529586792,
+      "rewards/cosine_scaled_reward/mean": 0.13140051066875458,
+      "rewards/cosine_scaled_reward/std": 0.5292099118232727,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2022.0,
+      "completions/mean_length": 1270.0,
+      "completions/mean_terminated_length": 1052.1600341796875,
+      "completions/min_length": 427.0,
+      "completions/min_terminated_length": 427.0,
+      "epoch": 0.21942857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2882588505744934,
+      "learning_rate": 1.0554024673218806e-07,
+      "loss": -0.0,
+      "num_tokens": 18344281.0,
+      "reward": 0.5913476943969727,
+      "reward_std": 0.6203497052192688,
+      "rewards/cosine_scaled_reward/mean": -0.11057613790035248,
+      "rewards/cosine_scaled_reward/std": 0.33690571784973145,
+      "rewards/format_reward/mean": 0.8125,
+      "rewards/format_reward/std": 0.39339789748191833,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1806.0,
+      "completions/mean_length": 1134.3125,
+      "completions/mean_terminated_length": 1022.1052856445312,
+      "completions/min_length": 475.0,
+      "completions/min_terminated_length": 475.0,
+      "epoch": 0.22057142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30495956540107727,
+      "learning_rate": 1.0437936906629334e-07,
+      "loss": 0.0,
+      "num_tokens": 18428021.0,
+      "reward": 0.9724597930908203,
+      "reward_std": 0.6338238716125488,
+      "rewards/cosine_scaled_reward/mean": 0.025292381644248962,
+      "rewards/cosine_scaled_reward/std": 0.47308972477912903,
+      "rewards/format_reward/mean": 0.921875,
+      "rewards/format_reward/std": 0.27048972249031067,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.234375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2022.0,
+      "completions/mean_length": 1284.34375,
+      "completions/mean_terminated_length": 1050.5714111328125,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "epoch": 0.22171428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29666370153427124,
+      "learning_rate": 1.0335423176140511e-07,
+      "loss": -0.0,
+      "num_tokens": 18521579.0,
+      "reward": 0.970361590385437,
+      "reward_std": 0.8541973829269409,
+      "rewards/cosine_scaled_reward/mean": 0.055493295192718506,
+      "rewards/cosine_scaled_reward/std": 0.5139825344085693,
+      "rewards/format_reward/mean": 0.859375,
+      "rewards/format_reward/std": 0.3503824472427368,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1614.0,
+      "completions/mean_length": 1111.140625,
+      "completions/mean_terminated_length": 957.8363037109375,
+      "completions/min_length": 612.0,
+      "completions/min_terminated_length": 612.0,
+      "epoch": 0.22285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2935192883014679,
+      "learning_rate": 1.0246514708427701e-07,
+      "loss": -0.0,
+      "num_tokens": 18603836.0,
+      "reward": 0.9238024353981018,
+      "reward_std": 0.7688024044036865,
+      "rewards/cosine_scaled_reward/mean": 0.008776212111115456,
+      "rewards/cosine_scaled_reward/std": 0.4346567392349243,
+      "rewards/format_reward/mean": 0.90625,
+      "rewards/format_reward/std": 0.29378482699394226,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1913.0,
+      "completions/mean_length": 1110.28125,
+      "completions/mean_terminated_length": 1064.163818359375,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "epoch": 0.224,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31850409507751465,
+      "learning_rate": 1.017123858587145e-07,
+      "loss": 0.0,
+      "num_tokens": 18686486.0,
+      "reward": 1.0064561367034912,
+      "reward_std": 0.6142268776893616,
+      "rewards/cosine_scaled_reward/mean": 0.0032280460000038147,
+      "rewards/cosine_scaled_reward/std": 0.4689313769340515,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1862.0,
+      "completions/max_terminated_length": 1862.0,
+      "completions/mean_length": 867.390625,
+      "completions/mean_terminated_length": 867.390625,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "epoch": 0.22514285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.36897119879722595,
+      "learning_rate": 1.0109617738307911e-07,
+      "loss": 0.0,
+      "num_tokens": 18752367.0,
+      "reward": 1.2200298309326172,
+      "reward_std": 0.7840542197227478,
+      "rewards/cosine_scaled_reward/mean": 0.11001493036746979,
+      "rewards/cosine_scaled_reward/std": 0.5105303525924683,
+      "rewards/format_reward/mean": 1.0,
+      "rewards/format_reward/std": 0.0,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2032.0,
+      "completions/mean_length": 1018.171875,
+      "completions/mean_terminated_length": 911.637939453125,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 0.22628571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33654487133026123,
+      "learning_rate": 1.0061670936044178e-07,
+      "loss": 0.0,
+      "num_tokens": 18829034.0,
+      "reward": 1.0653846263885498,
+      "reward_std": 0.7624523043632507,
+      "rewards/cosine_scaled_reward/mean": 0.04831730201840401,
+      "rewards/cosine_scaled_reward/std": 0.4961619973182678,
+      "rewards/format_reward/mean": 0.96875,
+      "rewards/format_reward/std": 0.17536810040473938,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2038.0,
+      "completions/mean_length": 1170.84375,
+      "completions/mean_terminated_length": 1096.5084228515625,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "epoch": 0.22742857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28278952836990356,
+      "learning_rate": 1.002741278414069e-07,
+      "loss": 0.0,
+      "num_tokens": 18915472.0,
+      "reward": 0.6831471920013428,
+      "reward_std": 0.6951984167098999,
+      "rewards/cosine_scaled_reward/mean": -0.1506139189004898,
+      "rewards/cosine_scaled_reward/std": 0.34608688950538635,
+      "rewards/format_reward/mean": 0.984375,
+      "rewards/format_reward/std": 0.125,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1764.0,
+      "completions/mean_length": 999.390625,
+      "completions/mean_terminated_length": 849.5892944335938,
+      "completions/min_length": 395.0,
+      "completions/min_terminated_length": 395.0,
+      "epoch": 0.22857142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28817513585090637,
+      "learning_rate": 1.0006853717962393e-07,
+      "loss": 0.0,
+      "num_tokens": 18989553.0,
+      "reward": 0.9030377864837646,
+      "reward_std": 0.8171917200088501,
+      "rewards/cosine_scaled_reward/mean": -0.01723114401102066,
+      "rewards/cosine_scaled_reward/std": 0.4829805791378021,
+      "rewards/format_reward/mean": 0.9375,
+      "rewards/format_reward/std": 0.24397502839565277,
+      "step": 200
+    },
+    {
+      "epoch": 0.22857142857142856,
+      "step": 200,
+      "total_flos": 0.0,
+      "train_loss": 3.2957177609205244e-09,
+      "train_runtime": 10011.2078,
+      "train_samples_per_second": 1.279,
+      "train_steps_per_second": 0.02
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 200,
+  "num_input_tokens_seen": 18989553,
+  "num_train_epochs": 1,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/training_args.bin b/training_args.bin
new file mode 100644
index 0000000..9e03ee7
--- /dev/null
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec4fbe7e29ae78abab9c9fd5555870c1bffe7656ceef4ac5fa2148a15b61b1e3
+size 8888