commit ee5ddba378f5173d26a1ffcae5b5ef7585790ebc Author: ModelHub XC Date: Tue May 12 12:06:33 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: LLucass/TT_L0.2_H0.2_grpo Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..478a303 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,40 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-50/tokenizer.json filter=lfs diff=lfs merge=lfs 
-text +checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-150/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..89f1efb --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +datasets: knoveleng/open-rs +library_name: transformers +model_name: TT_L0.2_H0.2_grpo +tags: +- generated_from_trainer +- open-r1 +- trl +- grpo +licence: license +--- + +# Model Card for TT_L0.2_H0.2_grpo + +This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on the [knoveleng/open-rs](https://huggingface.co/datasets/knoveleng/open-rs) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="LLucass/TT_L0.2_H0.2_grpo", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/lavatorywang-nus/uncertainty/runs/9gj0wo7b) + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). 
+ +### Framework versions + +- TRL: 0.16.0.dev0 +- Transformers: 4.51.3 +- Pytorch: 2.5.1 +- Datasets: 3.6.0 +- Tokenizers: 0.21.1 + +## Citations + +Cite GRPO as: + +```bibtex +@article{zhihong2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} + +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..2c27fe5 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 3.2957177609205244e-09, + "train_runtime": 10011.2078, + "train_samples": 7000, + "train_samples_per_second": 1.279, + "train_steps_per_second": 0.02 +} \ No newline at end of file diff --git a/checkpoint-100/config.json b/checkpoint-100/config.json new file mode 100644 index 0000000..78fed5b --- /dev/null +++ b/checkpoint-100/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, 
+ "sliding_window": 4096, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_cache": false, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/checkpoint-100/generation_config.json b/checkpoint-100/generation_config.json new file mode 100644 index 0000000..92878bd --- /dev/null +++ b/checkpoint-100/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..e4cbb55 --- /dev/null +++ b/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64d03fed17192dbd02910a453a129df1e59f0dc56bb9beb6c12fb15fd5a9c1de +size 5331274140 diff --git a/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..f1bc753 --- /dev/null +++ b/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f2840c47421f7c64caa11da52c0631d0552552ce75a900497c6cac91258525f +size 5331276572 diff --git a/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..c099b37 --- /dev/null +++ b/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aa79f325e1d1a7ccdd958c5099c83a7b19d5914ea40bc4442eaa921c39f052a +size 
5331276892 diff --git a/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..d51e3cf --- /dev/null +++ b/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99cca373a8c3f9a69295ac81f0febe4fbb3d279ceeec4d3de63743860f11bf47 +size 5331273884 diff --git a/checkpoint-100/global_step100/mp_rank_00_model_states.pt b/checkpoint-100/global_step100/mp_rank_00_model_states.pt new file mode 100644 index 0000000..f82ca69 --- /dev/null +++ b/checkpoint-100/global_step100/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e065897058412099161e568c02a323c6b14f44d563d9449bf43b5440be39c020 +size 3554267640 diff --git a/checkpoint-100/latest b/checkpoint-100/latest new file mode 100644 index 0000000..744ae7d --- /dev/null +++ b/checkpoint-100/latest @@ -0,0 +1 @@ +global_step100 \ No newline at end of file diff --git a/checkpoint-100/model.safetensors b/checkpoint-100/model.safetensors new file mode 100644 index 0000000..1fb28a4 --- /dev/null +++ b/checkpoint-100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8af2fd262f18c1787af93c8074f28997a45321d177fa5cd04da9d1387b9d7563 +size 3554214752 diff --git a/checkpoint-100/rng_state_0.pth b/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000..f388e2d --- /dev/null +++ b/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be658a6ef1d1c437376e35713827537091c6f33daa9df25eaa9e98991c241626 +size 14960 diff --git a/checkpoint-100/rng_state_1.pth b/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000..b8609e2 --- /dev/null +++ b/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:aed16eab73104db9391a6f908bd2c021091c25d0695a238c0b37b35f57381747 +size 14960 diff --git a/checkpoint-100/rng_state_2.pth b/checkpoint-100/rng_state_2.pth new file mode 100644 index 0000000..3faa9f7 --- /dev/null +++ b/checkpoint-100/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3391aacd8861abedf83b6b4f8fd56e9bda6ca4b7e62a19f0364fc11c5fb64740 +size 14960 diff --git a/checkpoint-100/rng_state_3.pth b/checkpoint-100/rng_state_3.pth new file mode 100644 index 0000000..a42199a --- /dev/null +++ b/checkpoint-100/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:985fd6b7842de914d0250ac9bd68ec694b87f6fc3fae0cbcb3027f4ca123d938 +size 14960 diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt new file mode 100644 index 0000000..256e515 --- /dev/null +++ b/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b803ddcc2d4f1857fc5f89309ee5e404cb000416c8d7f3e16508a024742ba28a +size 1064 diff --git a/checkpoint-100/special_tokens_map.json b/checkpoint-100/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/checkpoint-100/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-100/tokenizer.json b/checkpoint-100/tokenizer.json new file mode 100644 index 0000000..e7cd2c1 --- /dev/null +++ b/checkpoint-100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a +size 
11422959 diff --git a/checkpoint-100/tokenizer_config.json b/checkpoint-100/tokenizer_config.json new file mode 100644 index 0000000..ef6e98c --- /dev/null +++ b/checkpoint-100/tokenizer_config.json @@ -0,0 +1,195 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { 
+ "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = 
message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + 
"sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json new file mode 100644 index 0000000..6590f9c --- /dev/null +++ b/checkpoint-100/trainer_state.json @@ -0,0 +1,2734 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.11428571428571428, + "eval_steps": 500, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 1702.03125, + "completions/mean_terminated_length": 993.6190795898438, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.001142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2544386684894562, + "learning_rate": 0.0, + "loss": -0.0, + "num_tokens": 118418.0, + "reward": 0.17899775505065918, + "reward_std": 0.7650213241577148, + "rewards/cosine_scaled_reward/mean": -0.09800112992525101, + "rewards/cosine_scaled_reward/std": 0.37953105568885803, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.48795005679130554, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1894.0, + "completions/mean_length": 1738.90625, + "completions/mean_terminated_length": 949.0, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 
0.002285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2436082512140274, + "learning_rate": 5e-08, + "loss": -0.0, + "num_tokens": 239748.0, + "reward": 0.3848632574081421, + "reward_std": 0.9111153483390808, + "rewards/cosine_scaled_reward/mean": 0.020556632429361343, + "rewards/cosine_scaled_reward/std": 0.4492928683757782, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1545.0, + "completions/mean_length": 1989.015625, + "completions/mean_terminated_length": 1104.25, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "epoch": 0.0034285714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2544717788696289, + "learning_rate": 1e-07, + "loss": -0.0, + "num_tokens": 377517.0, + "reward": -0.3279358148574829, + "reward_std": 0.33216947317123413, + "rewards/cosine_scaled_reward/mean": -0.20303040742874146, + "rewards/cosine_scaled_reward/std": 0.179075226187706, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.27048972249031067, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1566.421875, + "completions/mean_terminated_length": 1084.84375, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "epoch": 0.004571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28807103633880615, + "learning_rate": 1.5e-07, + "loss": -0.0, + "num_tokens": 487576.0, + 
"reward": 0.2716121971607208, + "reward_std": 0.6643469333648682, + "rewards/cosine_scaled_reward/mean": -0.12981891632080078, + "rewards/cosine_scaled_reward/std": 0.3019586503505707, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.5029674172401428, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1807.0, + "completions/mean_length": 1936.84375, + "completions/mean_terminated_length": 1031.71435546875, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "epoch": 0.005714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26783761382102966, + "learning_rate": 2e-07, + "loss": -0.0, + "num_tokens": 622350.0, + "reward": -0.3612896800041199, + "reward_std": 0.41048353910446167, + "rewards/cosine_scaled_reward/mean": -0.23533234000205994, + "rewards/cosine_scaled_reward/std": 0.20467400550842285, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.3145764470100403, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1301.0, + "completions/mean_length": 1889.453125, + "completions/mean_terminated_length": 779.625, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "epoch": 0.006857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.262518972158432, + "learning_rate": 2.5e-07, + "loss": 0.0, + "num_tokens": 754923.0, + "reward": -0.29250282049179077, + "reward_std": 0.5422531962394714, + "rewards/cosine_scaled_reward/mean": -0.22437641024589539, + 
"rewards/cosine_scaled_reward/std": 0.22509199380874634, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36596253514289856, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1568.0, + "completions/mean_length": 1921.921875, + "completions/mean_terminated_length": 1314.45458984375, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "epoch": 0.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22601397335529327, + "learning_rate": 3e-07, + "loss": 0.0, + "num_tokens": 888334.0, + "reward": 0.025340259075164795, + "reward_std": 0.7285393476486206, + "rewards/cosine_scaled_reward/mean": -0.1279548704624176, + "rewards/cosine_scaled_reward/std": 0.40222346782684326, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1736.859375, + "completions/mean_terminated_length": 999.9473876953125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.009142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24552854895591736, + "learning_rate": 3.5e-07, + "loss": 0.0, + "num_tokens": 1009909.0, + "reward": 0.21729671955108643, + "reward_std": 0.6989120244979858, + "rewards/cosine_scaled_reward/mean": -0.055414143949747086, + "rewards/cosine_scaled_reward/std": 0.47493892908096313, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.4732423722743988, + 
"step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 1967.53125, + "completions/mean_terminated_length": 1475.77783203125, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "epoch": 0.010285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2430322915315628, + "learning_rate": 4e-07, + "loss": 0.0, + "num_tokens": 1147287.0, + "reward": -0.21451422572135925, + "reward_std": 0.587526798248291, + "rewards/cosine_scaled_reward/mean": -0.19319462776184082, + "rewards/cosine_scaled_reward/std": 0.29357606172561646, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.38025420904159546, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1708.546875, + "completions/mean_terminated_length": 961.75, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.011428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2543582320213318, + "learning_rate": 4.5e-07, + "loss": 0.0, + "num_tokens": 1267466.0, + "reward": 0.02539752423763275, + "reward_std": 0.545810341835022, + "rewards/cosine_scaled_reward/mean": -0.14355123043060303, + "rewards/cosine_scaled_reward/std": 0.36147356033325195, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.467176616191864, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1579.0, + "completions/mean_length": 1967.734375, + "completions/mean_terminated_length": 1191.8333740234375, + "completions/min_length": 843.0, + "completions/min_terminated_length": 843.0, + "epoch": 0.012571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24583907425403595, + "learning_rate": 5e-07, + "loss": -0.0, + "num_tokens": 1405073.0, + "reward": -0.46971434354782104, + "reward_std": 0.36104393005371094, + "rewards/cosine_scaled_reward/mean": -0.28173214197158813, + "rewards/cosine_scaled_reward/std": 0.17775526642799377, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29378482699394226, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 1707.5625, + "completions/mean_terminated_length": 1176.47998046875, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.013714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3135142922401428, + "learning_rate": 5.5e-07, + "loss": -0.0, + "num_tokens": 1525301.0, + "reward": 0.0018395520746707916, + "reward_std": 0.7012988328933716, + "rewards/cosine_scaled_reward/mean": -0.21783021092414856, + "rewards/cosine_scaled_reward/std": 0.324150949716568, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.5, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 
1745.0, + "completions/mean_length": 1841.96875, + "completions/mean_terminated_length": 1168.933349609375, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.014857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2532394826412201, + "learning_rate": 6e-07, + "loss": -0.0, + "num_tokens": 1654227.0, + "reward": -0.10322706401348114, + "reward_std": 0.6915165185928345, + "rewards/cosine_scaled_reward/mean": -0.17661353945732117, + "rewards/cosine_scaled_reward/std": 0.329875111579895, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1816.390625, + "completions/mean_terminated_length": 1306.8499755859375, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "epoch": 0.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28405147790908813, + "learning_rate": 6.5e-07, + "loss": 0.0, + "num_tokens": 1781084.0, + "reward": 0.10602855682373047, + "reward_std": 0.630502462387085, + "rewards/cosine_scaled_reward/mean": -0.11104822158813477, + "rewards/cosine_scaled_reward/std": 0.3846627473831177, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.4732423722743988, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1574.0, + "completions/mean_length": 1702.109375, + "completions/mean_terminated_length": 818.1666870117188, + "completions/min_length": 411.0, + 
"completions/min_terminated_length": 411.0, + "epoch": 0.017142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28779250383377075, + "learning_rate": 7e-07, + "loss": 0.0, + "num_tokens": 1900939.0, + "reward": 0.32734519243240356, + "reward_std": 0.3870265483856201, + "rewards/cosine_scaled_reward/mean": 0.007422588765621185, + "rewards/cosine_scaled_reward/std": 0.45787373185157776, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.467176616191864, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.018285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2337152510881424, + "learning_rate": 7.5e-07, + "loss": -0.0, + "num_tokens": 2042451.0, + "reward": -0.5429925918579102, + "reward_std": 0.3153150975704193, + "rewards/cosine_scaled_reward/mean": -0.2714962661266327, + "rewards/cosine_scaled_reward/std": 0.1678173691034317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 1564.921875, + "completions/mean_terminated_length": 858.8846435546875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.019428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33599403500556946, + "learning_rate": 8e-07, + "loss": -0.0, + 
"num_tokens": 2153126.0, + "reward": 0.17696775496006012, + "reward_std": 0.6489306688308716, + "rewards/cosine_scaled_reward/mean": -0.11464111506938934, + "rewards/cosine_scaled_reward/std": 0.3551919758319855, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49501484632492065, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 1795.390625, + "completions/mean_terminated_length": 893.21435546875, + "completions/min_length": 619.0, + "completions/min_terminated_length": 619.0, + "epoch": 0.02057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22697053849697113, + "learning_rate": 8.499999999999999e-07, + "loss": -0.0, + "num_tokens": 2278407.0, + "reward": -0.10711958259344101, + "reward_std": 0.5238703489303589, + "rewards/cosine_scaled_reward/mean": -0.1785597801208496, + "rewards/cosine_scaled_reward/std": 0.2545098662376404, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 1921.484375, + "completions/mean_terminated_length": 1238.300048828125, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "epoch": 0.021714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23972108960151672, + "learning_rate": 9e-07, + "loss": 0.0, + "num_tokens": 2412638.0, + "reward": 0.029344379901885986, + "reward_std": 0.6719281077384949, + 
"rewards/cosine_scaled_reward/mean": -0.086890310049057, + "rewards/cosine_scaled_reward/std": 0.40220555663108826, + "rewards/format_reward/mean": 0.203125, + "rewards/format_reward/std": 0.40550529956817627, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1728.5625, + "completions/mean_terminated_length": 845.4117431640625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.022857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23309311270713806, + "learning_rate": 9.499999999999999e-07, + "loss": 0.0, + "num_tokens": 2534618.0, + "reward": 0.0131673663854599, + "reward_std": 0.4436222314834595, + "rewards/cosine_scaled_reward/mean": -0.13404130935668945, + "rewards/cosine_scaled_reward/std": 0.32819250226020813, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 1777.953125, + "completions/mean_terminated_length": 1087.8333740234375, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29990270733833313, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 2659215.0, + "reward": -0.1764472872018814, + "reward_std": 0.5121938586235046, + "rewards/cosine_scaled_reward/mean": -0.2444736361503601, + "rewards/cosine_scaled_reward/std": 0.289971262216568, + 
"rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.467176616191864, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1361.28125, + "completions/mean_terminated_length": 921.0769653320312, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.025142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29922786355018616, + "learning_rate": 9.99931462820376e-07, + "loss": -0.0, + "num_tokens": 2755353.0, + "reward": 0.6089149713516235, + "reward_std": 0.5986809730529785, + "rewards/cosine_scaled_reward/mean": -0.05491749942302704, + "rewards/cosine_scaled_reward/std": 0.39076483249664307, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1565.046875, + "completions/mean_terminated_length": 903.2222290039062, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.026285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27512773871421814, + "learning_rate": 9.997258721585931e-07, + "loss": -0.0, + "num_tokens": 2866308.0, + "reward": 0.21871733665466309, + "reward_std": 0.5976030826568604, + "rewards/cosine_scaled_reward/mean": -0.10157884657382965, + "rewards/cosine_scaled_reward/std": 0.3856185972690582, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.49776285886764526, + "step": 23 + }, + 
{ + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 1801.671875, + "completions/mean_terminated_length": 1259.75, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "epoch": 0.027428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22642865777015686, + "learning_rate": 9.993832906395582e-07, + "loss": -0.0, + "num_tokens": 2992543.0, + "reward": 0.04899948835372925, + "reward_std": 0.8525694608688354, + "rewards/cosine_scaled_reward/mean": -0.17081275582313538, + "rewards/cosine_scaled_reward/std": 0.3993513882160187, + "rewards/format_reward/mean": 0.390625, + "rewards/format_reward/std": 0.4917473793029785, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1715.765625, + "completions/mean_terminated_length": 1035.4761962890625, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.02857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25316134095191956, + "learning_rate": 9.989038226169207e-07, + "loss": -0.0, + "num_tokens": 3112648.0, + "reward": 0.10585837811231613, + "reward_std": 0.7828943729400635, + "rewards/cosine_scaled_reward/mean": -0.11894579976797104, + "rewards/cosine_scaled_reward/std": 0.4141720235347748, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 1917.703125, + "completions/mean_terminated_length": 1452.357177734375, + "completions/min_length": 840.0, + "completions/min_terminated_length": 840.0, + "epoch": 0.029714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2521306574344635, + "learning_rate": 9.982876141412855e-07, + "loss": -0.0, + "num_tokens": 3246013.0, + "reward": 0.17620250582695007, + "reward_std": 0.6548349857330322, + "rewards/cosine_scaled_reward/mean": -0.08377375453710556, + "rewards/cosine_scaled_reward/std": 0.3527655303478241, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1851.015625, + "completions/mean_terminated_length": 1147.5, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "epoch": 0.030857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2730060815811157, + "learning_rate": 9.975348529157229e-07, + "loss": -0.0, + "num_tokens": 3374766.0, + "reward": -0.18854813277721405, + "reward_std": 0.49348777532577515, + "rewards/cosine_scaled_reward/mean": -0.21146157383918762, + "rewards/cosine_scaled_reward/std": 0.2601618766784668, + "rewards/format_reward/mean": 0.234375, + "rewards/format_reward/std": 0.42695629596710205, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 2023.0, + "completions/mean_length": 1798.328125, + "completions/mean_terminated_length": 1049.3125, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2566036880016327, + "learning_rate": 9.96645768238595e-07, + "loss": 0.0, + "num_tokens": 3500195.0, + "reward": 0.06705980002880096, + "reward_std": 0.7090284824371338, + "rewards/cosine_scaled_reward/mean": -0.10709509253501892, + "rewards/cosine_scaled_reward/std": 0.4101051986217499, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1930.203125, + "completions/mean_terminated_length": 1210.3333740234375, + "completions/min_length": 582.0, + "completions/min_terminated_length": 582.0, + "epoch": 0.03314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25197461247444153, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0, + "num_tokens": 3634200.0, + "reward": -0.2462695688009262, + "reward_std": 0.5237302780151367, + "rewards/cosine_scaled_reward/mean": -0.2012597918510437, + "rewards/cosine_scaled_reward/std": 0.23252712190151215, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36596253514289856, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1900.0, + "completions/mean_length": 1847.65625, + "completions/mean_terminated_length": 
1061.6923828125, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "epoch": 0.03428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30431485176086426, + "learning_rate": 9.944597532678119e-07, + "loss": 0.0, + "num_tokens": 3762986.0, + "reward": -0.05392302945256233, + "reward_std": 0.7249555587768555, + "rewards/cosine_scaled_reward/mean": -0.15196150541305542, + "rewards/cosine_scaled_reward/std": 0.34566983580589294, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1860.0, + "completions/mean_length": 1838.671875, + "completions/mean_terminated_length": 931.5833740234375, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.03542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2484513372182846, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0, + "num_tokens": 3891157.0, + "reward": -0.11271396279335022, + "reward_std": 0.6705260872840881, + "rewards/cosine_scaled_reward/mean": -0.1813569962978363, + "rewards/cosine_scaled_reward/std": 0.4071698486804962, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1715.0, + "completions/mean_length": 1910.109375, + "completions/mean_terminated_length": 1417.6429443359375, + "completions/min_length": 906.0, + "completions/min_terminated_length": 906.0, + "epoch": 
0.036571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25329527258872986, + "learning_rate": 9.917322325514487e-07, + "loss": -0.0, + "num_tokens": 4023756.0, + "reward": -0.08931556344032288, + "reward_std": 0.6381070613861084, + "rewards/cosine_scaled_reward/mean": -0.16965776681900024, + "rewards/cosine_scaled_reward/std": 0.37385129928588867, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1865.0, + "completions/mean_length": 2023.71875, + "completions/mean_terminated_length": 1530.0, + "completions/min_length": 1107.0, + "completions/min_terminated_length": 1107.0, + "epoch": 0.037714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22758109867572784, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0, + "num_tokens": 4164490.0, + "reward": -0.4589868187904358, + "reward_std": 0.5177067518234253, + "rewards/cosine_scaled_reward/mean": -0.2919934093952179, + "rewards/cosine_scaled_reward/std": 0.2252870500087738, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3333333432674408, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1454.78125, + "completions/mean_terminated_length": 963.2571411132812, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.038857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3234354257583618, + "learning_rate": 
9.88466529153356e-07, + "loss": 0.0, + "num_tokens": 4267148.0, + "reward": 0.656031608581543, + "reward_std": 0.7529654502868652, + "rewards/cosine_scaled_reward/mean": 0.05457830801606178, + "rewards/cosine_scaled_reward/std": 0.49684229493141174, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.501733124256134, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1724.0, + "completions/mean_length": 1819.078125, + "completions/mean_terminated_length": 716.0909423828125, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "epoch": 0.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2821458876132965, + "learning_rate": 9.866330768241983e-07, + "loss": -0.0, + "num_tokens": 4395065.0, + "reward": -0.09630556404590607, + "reward_std": 0.7089139223098755, + "rewards/cosine_scaled_reward/mean": -0.15752778947353363, + "rewards/cosine_scaled_reward/std": 0.3647947609424591, + "rewards/format_reward/mean": 0.21875, + "rewards/format_reward/std": 0.4166666865348816, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1811.0, + "completions/mean_length": 1954.34375, + "completions/mean_terminated_length": 1382.0, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "epoch": 0.04114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24163897335529327, + "learning_rate": 9.846666218300807e-07, + "loss": -0.0, + "num_tokens": 4531255.0, + "reward": -0.34593287110328674, + "reward_std": 0.44493502378463745, + 
"rewards/cosine_scaled_reward/mean": -0.24327893555164337, + "rewards/cosine_scaled_reward/std": 0.24784433841705322, + "rewards/format_reward/mean": 0.140625, + "rewards/format_reward/std": 0.3503824472427368, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1723.0, + "completions/mean_length": 1868.921875, + "completions/mean_terminated_length": 1092.916748046875, + "completions/min_length": 620.0, + "completions/min_terminated_length": 620.0, + "epoch": 0.04228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24795544147491455, + "learning_rate": 9.825677631722435e-07, + "loss": -0.0, + "num_tokens": 4661890.0, + "reward": -0.23053905367851257, + "reward_std": 0.34036368131637573, + "rewards/cosine_scaled_reward/mean": -0.2246445268392563, + "rewards/cosine_scaled_reward/std": 0.15942412614822388, + "rewards/format_reward/mean": 0.21875, + "rewards/format_reward/std": 0.4166666865348816, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 1889.53125, + "completions/mean_terminated_length": 1033.800048828125, + "completions/min_length": 810.0, + "completions/min_terminated_length": 810.0, + "epoch": 0.04342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24283826351165771, + "learning_rate": 9.80337140183366e-07, + "loss": 0.0, + "num_tokens": 4794532.0, + "reward": -0.10043507814407349, + "reward_std": 0.47925832867622375, + "rewards/cosine_scaled_reward/mean": -0.13615503907203674, + "rewards/cosine_scaled_reward/std": 
0.3336707651615143, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.38025420904159546, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1515.0, + "completions/mean_length": 1644.828125, + "completions/mean_terminated_length": 689.9473876953125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.044571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28362998366355896, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0, + "num_tokens": 4910585.0, + "reward": 0.12284853309392929, + "reward_std": 0.4183085858821869, + "rewards/cosine_scaled_reward/mean": -0.11045074462890625, + "rewards/cosine_scaled_reward/std": 0.30217844247817993, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1632.0, + "completions/mean_length": 1618.28125, + "completions/mean_terminated_length": 902.0833740234375, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.045714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.262617826461792, + "learning_rate": 9.754833590196926e-07, + "loss": 0.0, + "num_tokens": 5024227.0, + "reward": 0.2076582908630371, + "reward_std": 0.42125773429870605, + "rewards/cosine_scaled_reward/mean": -0.12273336946964264, + "rewards/cosine_scaled_reward/std": 0.4404613971710205, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, 
+ "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1914.0, + "completions/mean_length": 1717.734375, + "completions/mean_terminated_length": 1235.0384521484375, + "completions/min_length": 664.0, + "completions/min_terminated_length": 664.0, + "epoch": 0.046857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23294499516487122, + "learning_rate": 9.728616793536587e-07, + "loss": -0.0, + "num_tokens": 5145314.0, + "reward": 0.011502981185913086, + "reward_std": 0.6816084980964661, + "rewards/cosine_scaled_reward/mean": -0.22081100940704346, + "rewards/cosine_scaled_reward/std": 0.37589573860168457, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1672.0, + "completions/mean_length": 1703.921875, + "completions/mean_terminated_length": 579.933349609375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34672290086746216, + "learning_rate": 9.701111919237408e-07, + "loss": -0.0, + "num_tokens": 5264725.0, + "reward": -0.2616002857685089, + "reward_std": 0.37952175736427307, + "rewards/cosine_scaled_reward/mean": -0.26361262798309326, + "rewards/cosine_scaled_reward/std": 0.17531204223632812, + "rewards/format_reward/mean": 0.265625, + "rewards/format_reward/std": 0.44515693187713623, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1370.0, + "completions/mean_length": 1681.84375, + "completions/mean_terminated_length": 814.631591796875, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.04914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.263967901468277, + "learning_rate": 9.672327345550543e-07, + "loss": -0.0, + "num_tokens": 5383979.0, + "reward": 0.13376155495643616, + "reward_std": 0.46012288331985474, + "rewards/cosine_scaled_reward/mean": -0.08155670762062073, + "rewards/cosine_scaled_reward/std": 0.3612325191497803, + "rewards/format_reward/mean": 0.296875, + "rewards/format_reward/std": 0.4604927599430084, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 1624.625, + "completions/mean_terminated_length": 869.9130859375, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.05028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28927963972091675, + "learning_rate": 9.64227184053598e-07, + "loss": -0.0, + "num_tokens": 5498651.0, + "reward": 0.20869271457195282, + "reward_std": 0.5558150410652161, + "rewards/cosine_scaled_reward/mean": -0.0987786278128624, + "rewards/cosine_scaled_reward/std": 0.42912590503692627, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49501484632492065, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.921875, + 
"completions/max_length": 2048.0, + "completions/max_terminated_length": 1851.0, + "completions/mean_length": 2006.96875, + "completions/mean_terminated_length": 1522.800048828125, + "completions/min_length": 955.0, + "completions/min_terminated_length": 955.0, + "epoch": 0.05142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24254000186920166, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0, + "num_tokens": 5638753.0, + "reward": -0.2540697157382965, + "reward_std": 0.4600578844547272, + "rewards/cosine_scaled_reward/mean": -0.20515984296798706, + "rewards/cosine_scaled_reward/std": 0.3251590430736542, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36596253514289856, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1563.0, + "completions/mean_length": 1765.984375, + "completions/mean_terminated_length": 919.9375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "epoch": 0.052571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2645930349826813, + "learning_rate": 9.578385041664925e-07, + "loss": 0.0, + "num_tokens": 5762944.0, + "reward": -0.213707834482193, + "reward_std": 0.38778313994407654, + "rewards/cosine_scaled_reward/mean": -0.2318539321422577, + "rewards/cosine_scaled_reward/std": 0.21436986327171326, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1583.40625, + 
"completions/mean_terminated_length": 986.0714721679688, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.053714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.311797559261322, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0, + "num_tokens": 5874682.0, + "reward": 0.27925533056259155, + "reward_std": 0.6467443704605103, + "rewards/cosine_scaled_reward/mean": -0.07912233471870422, + "rewards/cosine_scaled_reward/std": 0.4737093150615692, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.5, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 1690.0625, + "completions/mean_terminated_length": 1006.727294921875, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.054857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26644304394721985, + "learning_rate": 9.509529358847654e-07, + "loss": -0.0, + "num_tokens": 5993390.0, + "reward": 0.13692031800746918, + "reward_std": 0.5655145049095154, + "rewards/cosine_scaled_reward/mean": -0.12685233354568481, + "rewards/cosine_scaled_reward/std": 0.32320985198020935, + "rewards/format_reward/mean": 0.390625, + "rewards/format_reward/std": 0.4917473793029785, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1387.140625, + "completions/mean_terminated_length": 804.0294189453125, + "completions/min_length": 300.0, + 
"completions/min_terminated_length": 300.0, + "epoch": 0.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3078882396221161, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0, + "num_tokens": 6092231.0, + "reward": 0.35559189319610596, + "reward_std": 0.5927403569221497, + "rewards/cosine_scaled_reward/mean": -0.09564155340194702, + "rewards/cosine_scaled_reward/std": 0.4046906530857086, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.501733124256134, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 1674.890625, + "completions/mean_terminated_length": 962.5909423828125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.05714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23925544321537018, + "learning_rate": 9.43578868212728e-07, + "loss": -0.0, + "num_tokens": 6210240.0, + "reward": 0.18573230504989624, + "reward_std": 0.5264967083930969, + "rewards/cosine_scaled_reward/mean": -0.09463384002447128, + "rewards/cosine_scaled_reward/std": 0.4100942015647888, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.48795005679130554, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1347.40625, + "completions/mean_terminated_length": 836.1621704101562, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.05828571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 
0.325811505317688, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0, + "num_tokens": 6306682.0, + "reward": 0.1735648661851883, + "reward_std": 0.5335988998413086, + "rewards/cosine_scaled_reward/mean": -0.21009255945682526, + "rewards/cosine_scaled_reward/std": 0.2623959481716156, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49501484632492065, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1390.0, + "completions/mean_length": 1727.765625, + "completions/mean_terminated_length": 767.0625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.05942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27392977476119995, + "learning_rate": 9.357252853159505e-07, + "loss": 0.0, + "num_tokens": 6428611.0, + "reward": -0.16267812252044678, + "reward_std": 0.5682471990585327, + "rewards/cosine_scaled_reward/mean": -0.2219640612602234, + "rewards/cosine_scaled_reward/std": 0.36739134788513184, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1907.0, + "completions/mean_length": 1609.171875, + "completions/mean_terminated_length": 924.5999755859375, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.060571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28155064582824707, + "learning_rate": 9.316216432703916e-07, + "loss": -0.0, + "num_tokens": 6542430.0, + "reward": 
0.0752667784690857, + "reward_std": 0.7118167281150818, + "rewards/cosine_scaled_reward/mean": -0.18892911076545715, + "rewards/cosine_scaled_reward/std": 0.3222156763076782, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1588.234375, + "completions/mean_terminated_length": 1067.166748046875, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "epoch": 0.061714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2555343806743622, + "learning_rate": 9.274017555754407e-07, + "loss": 0.0, + "num_tokens": 6655221.0, + "reward": 0.6341299414634705, + "reward_std": 1.0656921863555908, + "rewards/cosine_scaled_reward/mean": 0.05143994837999344, + "rewards/cosine_scaled_reward/std": 0.5348308086395264, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.5029674172401428, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1420.0, + "completions/mean_length": 1549.5625, + "completions/mean_terminated_length": 821.0769653320312, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "epoch": 0.06285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30243629217147827, + "learning_rate": 9.230669076497687e-07, + "loss": -0.0, + "num_tokens": 6764681.0, + "reward": 0.13021975755691528, + "reward_std": 0.3984764516353607, + "rewards/cosine_scaled_reward/mean": -0.13801513612270355, 
+ "rewards/cosine_scaled_reward/std": 0.41228073835372925, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49501484632492065, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1633.25, + "completions/mean_terminated_length": 1132.689697265625, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "epoch": 0.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23835402727127075, + "learning_rate": 9.186184199300463e-07, + "loss": -0.0, + "num_tokens": 6880169.0, + "reward": 0.27981996536254883, + "reward_std": 0.5018116235733032, + "rewards/cosine_scaled_reward/mean": -0.10227750986814499, + "rewards/cosine_scaled_reward/std": 0.481824666261673, + "rewards/format_reward/mean": 0.484375, + "rewards/format_reward/std": 0.5037065148353577, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1807.0, + "completions/mean_length": 1699.875, + "completions/mean_terminated_length": 1156.7999267578125, + "completions/min_length": 642.0, + "completions/min_terminated_length": 642.0, + "epoch": 0.06514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22349494695663452, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0, + "num_tokens": 7000529.0, + "reward": -0.026505012065172195, + "reward_std": 0.5785415172576904, + "rewards/cosine_scaled_reward/mean": -0.20856501162052155, + "rewards/cosine_scaled_reward/std": 0.2749907374382019, + "rewards/format_reward/mean": 0.390625, + 
"rewards/format_reward/std": 0.4917473793029785, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1457.875, + "completions/mean_terminated_length": 1054.105224609375, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "epoch": 0.06628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.261942595243454, + "learning_rate": 9.093859795212817e-07, + "loss": 0.0, + "num_tokens": 7103929.0, + "reward": 0.5745843648910522, + "reward_std": 0.8671218156814575, + "rewards/cosine_scaled_reward/mean": -0.03302033245563507, + "rewards/cosine_scaled_reward/std": 0.45529407262802124, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4836103618144989, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1590.0625, + "completions/mean_terminated_length": 1159.8787841796875, + "completions/min_length": 591.0, + "completions/min_terminated_length": 591.0, + "epoch": 0.06742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24828943610191345, + "learning_rate": 9.046048391230247e-07, + "loss": -0.0, + "num_tokens": 7216157.0, + "reward": 0.3377103805541992, + "reward_std": 0.5543617010116577, + "rewards/cosine_scaled_reward/mean": -0.1045822948217392, + "rewards/cosine_scaled_reward/std": 0.39040952920913696, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.501733124256134, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 1622.84375, + "completions/mean_terminated_length": 1076.21435546875, + "completions/min_length": 555.0, + "completions/min_terminated_length": 555.0, + "epoch": 0.06857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2752656936645508, + "learning_rate": 8.997156826556369e-07, + "loss": -0.0, + "num_tokens": 7330907.0, + "reward": 0.11114693433046341, + "reward_std": 0.6926254034042358, + "rewards/cosine_scaled_reward/mean": -0.1788015365600586, + "rewards/cosine_scaled_reward/std": 0.39409172534942627, + "rewards/format_reward/mean": 0.46875, + "rewards/format_reward/std": 0.5029674172401428, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1708.859375, + "completions/mean_terminated_length": 1014.4285888671875, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.06971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22669929265975952, + "learning_rate": 8.9471999940354e-07, + "loss": -0.0, + "num_tokens": 7451794.0, + "reward": 0.2345120906829834, + "reward_std": 0.6293160319328308, + "rewards/cosine_scaled_reward/mean": -0.1093064472079277, + "rewards/cosine_scaled_reward/std": 0.29189831018447876, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1281.53125, + "completions/mean_terminated_length": 1004.2978515625, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.07085714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25438693165779114, + "learning_rate": 8.896193111002475e-07, + "loss": 0.0, + "num_tokens": 7544044.0, + "reward": 0.9180847406387329, + "reward_std": 0.6390912532806396, + "rewards/cosine_scaled_reward/mean": 0.06841734796762466, + "rewards/cosine_scaled_reward/std": 0.48315128684043884, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4166666865348816, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 1310.46875, + "completions/mean_terminated_length": 896.731689453125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28795576095581055, + "learning_rate": 8.844151714648274e-07, + "loss": -0.0, + "num_tokens": 7638170.0, + "reward": 0.6488770246505737, + "reward_std": 0.7876260876655579, + "rewards/cosine_scaled_reward/mean": -0.019311510026454926, + "rewards/cosine_scaled_reward/std": 0.4736698865890503, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.467176616191864, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1969.0, + 
"completions/mean_length": 1307.625, + "completions/mean_terminated_length": 1039.8297119140625, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.07314285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25637197494506836, + "learning_rate": 8.791091657286267e-07, + "loss": -0.0, + "num_tokens": 7732810.0, + "reward": 0.8280279636383057, + "reward_std": 0.6804471015930176, + "rewards/cosine_scaled_reward/mean": 0.015576483681797981, + "rewards/cosine_scaled_reward/std": 0.44819310307502747, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.40550529956817627, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1846.0, + "completions/mean_length": 1322.125, + "completions/mean_terminated_length": 914.9268188476562, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.07428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2944399118423462, + "learning_rate": 8.737029101523929e-07, + "loss": -0.0, + "num_tokens": 7828130.0, + "reward": 0.15610456466674805, + "reward_std": 0.4606686234474182, + "rewards/cosine_scaled_reward/mean": -0.24226020276546478, + "rewards/cosine_scaled_reward/std": 0.33131492137908936, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4836103618144989, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 1020.21875, + "completions/mean_terminated_length": 806.9057006835938, + 
"completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.07542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32644009590148926, + "learning_rate": 8.681980515339463e-07, + "loss": 0.0, + "num_tokens": 7903656.0, + "reward": 0.7972471714019775, + "reward_std": 0.7674820423126221, + "rewards/cosine_scaled_reward/mean": -0.031063925474882126, + "rewards/cosine_scaled_reward/std": 0.5106223225593567, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1910.0, + "completions/mean_length": 1750.859375, + "completions/mean_terminated_length": 1142.4285888671875, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "epoch": 0.07657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2270829975605011, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0, + "num_tokens": 8026447.0, + "reward": -0.1400720775127411, + "reward_std": 0.3325888514518738, + "rewards/cosine_scaled_reward/mean": -0.24972353875637054, + "rewards/cosine_scaled_reward/std": 0.16404789686203003, + "rewards/format_reward/mean": 0.359375, + "rewards/format_reward/std": 0.4836103618144989, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1424.0, + "completions/mean_length": 769.546875, + "completions/mean_terminated_length": 637.2930908203125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.07771428571428571, 
+ "frac_reward_zero_std": 0.0, + "grad_norm": 0.37025144696235657, + "learning_rate": 8.568992620281243e-07, + "loss": -0.0, + "num_tokens": 8084954.0, + "reward": 0.9792699813842773, + "reward_std": 0.804767370223999, + "rewards/cosine_scaled_reward/mean": 0.03651002421975136, + "rewards/cosine_scaled_reward/std": 0.46041443943977356, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1701.0, + "completions/mean_length": 1086.234375, + "completions/mean_terminated_length": 886.6226806640625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.07885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3763800263404846, + "learning_rate": 8.511087728614862e-07, + "loss": 0.0, + "num_tokens": 8164817.0, + "reward": 0.35803771018981934, + "reward_std": 0.5702667236328125, + "rewards/cosine_scaled_reward/mean": -0.24285613000392914, + "rewards/cosine_scaled_reward/std": 0.3019825220108032, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1898.0, + "completions/mean_length": 1463.375, + "completions/mean_terminated_length": 1112.5999755859375, + "completions/min_length": 503.0, + "completions/min_terminated_length": 503.0, + "epoch": 0.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24232418835163116, + "learning_rate": 8.452265630457282e-07, + "loss": -0.0, + 
"num_tokens": 8269929.0, + "reward": 0.3703588843345642, + "reward_std": 0.7288752794265747, + "rewards/cosine_scaled_reward/mean": -0.1351330280303955, + "rewards/cosine_scaled_reward/std": 0.3751916289329529, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4836103618144989, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1409.859375, + "completions/mean_terminated_length": 973.2368774414062, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.08114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.300010621547699, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0, + "num_tokens": 8370880.0, + "reward": 0.5196826457977295, + "reward_std": 0.7097917795181274, + "rewards/cosine_scaled_reward/mean": -0.044846177101135254, + "rewards/cosine_scaled_reward/std": 0.508389949798584, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4917473793029785, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 1228.046875, + "completions/mean_terminated_length": 931.4680786132812, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.08228571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30454304814338684, + "learning_rate": 8.331941759724268e-07, + "loss": -0.0, + "num_tokens": 8459827.0, + "reward": 0.41365131735801697, + "reward_std": 0.5005639791488647, + 
"rewards/cosine_scaled_reward/mean": -0.1759868562221527, + "rewards/cosine_scaled_reward/std": 0.19868774712085724, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1719.0, + "completions/mean_length": 1513.28125, + "completions/mean_terminated_length": 1192.4500732421875, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "epoch": 0.08342857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27848970890045166, + "learning_rate": 8.270476638965461e-07, + "loss": -0.0, + "num_tokens": 8567405.0, + "reward": 0.09570223093032837, + "reward_std": 0.5445049405097961, + "rewards/cosine_scaled_reward/mean": -0.2802739143371582, + "rewards/cosine_scaled_reward/std": 0.25603488087654114, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4787135720252991, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1888.0, + "completions/mean_length": 1240.125, + "completions/mean_terminated_length": 924.0, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "epoch": 0.08457142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2789021134376526, + "learning_rate": 8.208167604184217e-07, + "loss": 0.0, + "num_tokens": 8656701.0, + "reward": 0.7823752760887146, + "reward_std": 0.6479132175445557, + "rewards/cosine_scaled_reward/mean": 0.031812600791454315, + "rewards/cosine_scaled_reward/std": 0.5397623181343079, + 
"rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1455.953125, + "completions/mean_terminated_length": 1186.8409423828125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "epoch": 0.08571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22443196177482605, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0, + "num_tokens": 8760842.0, + "reward": 0.8040015697479248, + "reward_std": 0.5675323009490967, + "rewards/cosine_scaled_reward/mean": 0.027000809088349342, + "rewards/cosine_scaled_reward/std": 0.5096040964126587, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4364357888698578, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1720.0, + "completions/mean_length": 1177.859375, + "completions/mean_terminated_length": 863.1276245117188, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.08685714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32647648453712463, + "learning_rate": 8.081093963579707e-07, + "loss": 0.0, + "num_tokens": 8846625.0, + "reward": 0.310506671667099, + "reward_std": 0.5110941529273987, + "rewards/cosine_scaled_reward/mean": -0.2119341641664505, + "rewards/cosine_scaled_reward/std": 0.24737994372844696, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44515693187713623, + "step": 76 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1787.0, + "completions/mean_length": 1263.4375, + "completions/mean_terminated_length": 1043.760009765625, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2545543611049652, + "learning_rate": 8.01636806561836e-07, + "loss": -0.0, + "num_tokens": 8939061.0, + "reward": 0.5484907031059265, + "reward_std": 0.48998576402664185, + "rewards/cosine_scaled_reward/mean": -0.13200464844703674, + "rewards/cosine_scaled_reward/std": 0.3430649936199188, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39339789748191833, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1460.78125, + "completions/mean_terminated_length": 1059.0, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.08914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2583931088447571, + "learning_rate": 7.950875657567621e-07, + "loss": 0.0, + "num_tokens": 9043271.0, + "reward": 0.6075442433357239, + "reward_std": 0.6895643472671509, + "rewards/cosine_scaled_reward/mean": -0.0009153857827186584, + "rewards/cosine_scaled_reward/std": 0.48922818899154663, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4917473793029785, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1956.0, + "completions/mean_length": 1054.875, + "completions/mean_terminated_length": 892.3635864257812, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.09028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29089078307151794, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0, + "num_tokens": 9120879.0, + "reward": 0.6885831356048584, + "reward_std": 0.508629322052002, + "rewards/cosine_scaled_reward/mean": -0.09320840239524841, + "rewards/cosine_scaled_reward/std": 0.38835227489471436, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1399.046875, + "completions/mean_terminated_length": 1145.1087646484375, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "epoch": 0.09142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27458345890045166, + "learning_rate": 7.817671337095244e-07, + "loss": 0.0, + "num_tokens": 9220810.0, + "reward": 0.5549384355545044, + "reward_std": 0.7092134952545166, + "rewards/cosine_scaled_reward/mean": -0.09753081202507019, + "rewards/cosine_scaled_reward/std": 0.4125780463218689, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4364357888698578, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1833.0, + "completions/mean_length": 1084.984375, + "completions/mean_terminated_length": 906.6481323242188, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.09257142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37247684597969055, + "learning_rate": 7.75e-07, + "loss": -0.0, + "num_tokens": 9301521.0, + "reward": 0.5357480049133301, + "reward_std": 0.5661624670028687, + "rewards/cosine_scaled_reward/mean": -0.18525099754333496, + "rewards/cosine_scaled_reward/std": 0.3385297954082489, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1260.921875, + "completions/mean_terminated_length": 998.5625, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.09371428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27329322695732117, + "learning_rate": 7.681643291108517e-07, + "loss": -0.0, + "num_tokens": 9392548.0, + "reward": 0.9478914737701416, + "reward_std": 0.4313860237598419, + "rewards/cosine_scaled_reward/mean": 0.09894578158855438, + "rewards/cosine_scaled_reward/std": 0.5477120876312256, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4364357888698578, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1309.671875, + "completions/mean_terminated_length": 
922.9285888671875, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.09485714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3202998638153076, + "learning_rate": 7.612622032536507e-07, + "loss": -0.0, + "num_tokens": 9487455.0, + "reward": 0.5201998949050903, + "reward_std": 0.6858996152877808, + "rewards/cosine_scaled_reward/mean": -0.09927503764629364, + "rewards/cosine_scaled_reward/std": 0.37909674644470215, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1685.0, + "completions/mean_length": 1185.703125, + "completions/mean_terminated_length": 965.9019775390625, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29646041989326477, + "learning_rate": 7.54295724882796e-07, + "loss": -0.0, + "num_tokens": 9574036.0, + "reward": 0.6779025793075562, + "reward_std": 0.557724118232727, + "rewards/cosine_scaled_reward/mean": -0.09073619544506073, + "rewards/cosine_scaled_reward/std": 0.3855368196964264, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1297.828125, + "completions/mean_terminated_length": 1158.907470703125, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "epoch": 
0.09714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21307455003261566, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0, + "num_tokens": 9667417.0, + "reward": 0.5093189477920532, + "reward_std": 0.6006681323051453, + "rewards/cosine_scaled_reward/mean": -0.1672155261039734, + "rewards/cosine_scaled_reward/std": 0.34896284341812134, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1859.0, + "completions/mean_length": 1348.90625, + "completions/mean_terminated_length": 1096.04248046875, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.09828571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2883393168449402, + "learning_rate": 7.401782177833147e-07, + "loss": -0.0, + "num_tokens": 9764603.0, + "reward": 0.8025823831558228, + "reward_std": 0.547119677066803, + "rewards/cosine_scaled_reward/mean": 0.01847870647907257, + "rewards/cosine_scaled_reward/std": 0.4346420168876648, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1782.0, + "completions/mean_length": 1086.96875, + "completions/mean_terminated_length": 909.0, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.09942857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31888866424560547, + "learning_rate": 
7.330314893841101e-07, + "loss": -0.0, + "num_tokens": 9844289.0, + "reward": 0.5533354878425598, + "reward_std": 0.5319498777389526, + "rewards/cosine_scaled_reward/mean": -0.1530197560787201, + "rewards/cosine_scaled_reward/std": 0.2434682846069336, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 954.921875, + "completions/mean_terminated_length": 919.6612548828125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.10057142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3025936484336853, + "learning_rate": 7.258290078201731e-07, + "loss": -0.0, + "num_tokens": 9915916.0, + "reward": 1.2692296504974365, + "reward_std": 0.5115163326263428, + "rewards/cosine_scaled_reward/mean": 0.13461479544639587, + "rewards/cosine_scaled_reward/std": 0.506001353263855, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 1351.8125, + "completions/mean_terminated_length": 1174.35302734375, + "completions/min_length": 650.0, + "completions/min_terminated_length": 650.0, + "epoch": 0.10171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23423585295677185, + "learning_rate": 7.185729670371604e-07, + "loss": -0.0, + "num_tokens": 10013432.0, + "reward": 0.724889874458313, + "reward_std": 0.7425336837768555, + 
"rewards/cosine_scaled_reward/mean": -0.0828675627708435, + "rewards/cosine_scaled_reward/std": 0.3893774449825287, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1906.0, + "completions/mean_length": 1153.28125, + "completions/mean_terminated_length": 1025.46435546875, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "epoch": 0.10285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3860023021697998, + "learning_rate": 7.11265577295385e-07, + "loss": -0.0, + "num_tokens": 10097242.0, + "reward": 0.5000253915786743, + "reward_std": 0.5103108286857605, + "rewards/cosine_scaled_reward/mean": -0.18748730421066284, + "rewards/cosine_scaled_reward/std": 0.2787182629108429, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1455.484375, + "completions/mean_terminated_length": 1166.1163330078125, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2551063895225525, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0, + "num_tokens": 10200961.0, + "reward": 0.4053259789943695, + "reward_std": 0.663999617099762, + "rewards/cosine_scaled_reward/mean": -0.18796202540397644, + "rewards/cosine_scaled_reward/std": 0.35777655243873596, + 
"rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4166666865348816, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1176.953125, + "completions/mean_terminated_length": 1015.6481323242188, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.10514285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27449366450309753, + "learning_rate": 6.965056695057204e-07, + "loss": -0.0, + "num_tokens": 10286278.0, + "reward": 0.5743436217308044, + "reward_std": 0.6229422092437744, + "rewards/cosine_scaled_reward/mean": -0.15032817423343658, + "rewards/cosine_scaled_reward/std": 0.2899566888809204, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1434.875, + "completions/mean_terminated_length": 1156.181884765625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.10628571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2839376926422119, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0, + "num_tokens": 10389454.0, + "reward": 0.30658647418022156, + "reward_std": 0.5343226194381714, + "rewards/cosine_scaled_reward/mean": -0.22951926290988922, + "rewards/cosine_scaled_reward/std": 0.2324177473783493, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 93 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1684.0, + "completions/mean_length": 1242.390625, + "completions/mean_terminated_length": 927.1522216796875, + "completions/min_length": 508.0, + "completions/min_terminated_length": 508.0, + "epoch": 0.10742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2985072433948517, + "learning_rate": 6.815672671252315e-07, + "loss": 0.0, + "num_tokens": 10478735.0, + "reward": 0.6593698263168335, + "reward_std": 0.5845412015914917, + "rewards/cosine_scaled_reward/mean": -0.02969011664390564, + "rewards/cosine_scaled_reward/std": 0.47056320309638977, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1931.0, + "completions/mean_length": 1203.265625, + "completions/mean_terminated_length": 1082.58935546875, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "epoch": 0.10857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2689598798751831, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0, + "num_tokens": 10566272.0, + "reward": 0.4301251173019409, + "reward_std": 0.4795047640800476, + "rewards/cosine_scaled_reward/mean": -0.22243742644786835, + "rewards/cosine_scaled_reward/std": 0.2575407326221466, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 1205.5625, + "completions/mean_terminated_length": 990.8235473632812, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.10971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30502915382385254, + "learning_rate": 6.664685702961344e-07, + "loss": -0.0, + "num_tokens": 10654564.0, + "reward": 0.896080493927002, + "reward_std": 0.6987663507461548, + "rewards/cosine_scaled_reward/mean": 0.02616523765027523, + "rewards/cosine_scaled_reward/std": 0.460237056016922, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1777.0, + "completions/mean_length": 1170.390625, + "completions/mean_terminated_length": 988.2453002929688, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.11085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3103901743888855, + "learning_rate": 6.588648530198504e-07, + "loss": -0.0, + "num_tokens": 10739733.0, + "reward": 0.6633297204971313, + "reward_std": 0.609075665473938, + "rewards/cosine_scaled_reward/mean": -0.12927262485027313, + "rewards/cosine_scaled_reward/std": 0.4114542305469513, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1817.0, + "completions/mean_length": 1136.5625, + "completions/mean_terminated_length": 947.396240234375, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2510873079299927, + "learning_rate": 6.512279744547392e-07, + "loss": 0.0, + "num_tokens": 10823537.0, + "reward": 0.6613268256187439, + "reward_std": 0.4785424768924713, + "rewards/cosine_scaled_reward/mean": -0.09902409464120865, + "rewards/cosine_scaled_reward/std": 0.4345317482948303, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1171.8125, + "completions/mean_terminated_length": 1081.17236328125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.11314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.281054824590683, + "learning_rate": 6.435602608679916e-07, + "loss": -0.0, + "num_tokens": 10909701.0, + "reward": 1.0416245460510254, + "reward_std": 0.6949809789657593, + "rewards/cosine_scaled_reward/mean": 0.0520622618496418, + "rewards/cosine_scaled_reward/std": 0.508481502532959, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1120.8125, + "completions/mean_terminated_length": 
1024.8966064453125, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.11428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2910788655281067, + "learning_rate": 6.358640479194451e-07, + "loss": 0.0, + "num_tokens": 10991145.0, + "reward": 1.2036188840866089, + "reward_std": 0.8533884286880493, + "rewards/cosine_scaled_reward/mean": 0.14087192714214325, + "rewards/cosine_scaled_reward/std": 0.5375887751579285, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 100 + } + ], + "logging_steps": 1, + "max_steps": 200, + "num_input_tokens_seen": 10991145, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin new file mode 100644 index 0000000..9e03ee7 --- /dev/null +++ b/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec4fbe7e29ae78abab9c9fd5555870c1bffe7656ceef4ac5fa2148a15b61b1e3 +size 8888 diff --git a/checkpoint-100/zero_to_fp32.py b/checkpoint-100/zero_to_fp32.py new file mode 100644 index 0000000..0e75914 --- /dev/null +++ b/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. 
+# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob 
rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return 
zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient. + Convert a pseudo tensor to a torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint.
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-150/config.json b/checkpoint-150/config.json new file mode 100644 index 0000000..78fed5b --- /dev/null +++ b/checkpoint-150/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": 4096, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_cache": false, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/checkpoint-150/generation_config.json b/checkpoint-150/generation_config.json new file mode 100644 index 0000000..92878bd --- /dev/null +++ b/checkpoint-150/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-150/global_step150/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-150/global_step150/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..13300da --- 
/dev/null +++ b/checkpoint-150/global_step150/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8263fa53cf56652267f39b5f0f25269cd883d7dd51a99633fd6cf76b7be76642 +size 5331274140 diff --git a/checkpoint-150/global_step150/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-150/global_step150/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..8e48238 --- /dev/null +++ b/checkpoint-150/global_step150/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff2755e67ca872d954992952904fa6aa172ed763e5dc041dcb6358e9ae584431 +size 5331276572 diff --git a/checkpoint-150/global_step150/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-150/global_step150/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..0d06338 --- /dev/null +++ b/checkpoint-150/global_step150/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db406d943ceefc64f936cccc352f18706c5b7067190422d4dc9b05b29ecd4adb +size 5331276892 diff --git a/checkpoint-150/global_step150/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-150/global_step150/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..fa196ed --- /dev/null +++ b/checkpoint-150/global_step150/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c8b0281b6b31e755fd279fdb11ddff5ad0366bb628bd0c6f686a0a5cb9fe9b9 +size 5331273884 diff --git a/checkpoint-150/global_step150/mp_rank_00_model_states.pt b/checkpoint-150/global_step150/mp_rank_00_model_states.pt new file mode 100644 index 0000000..f6ec2c2 --- /dev/null +++ b/checkpoint-150/global_step150/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:dd5b89bece86d35e0971e0feffeba21aedbf60967650a4c8721e2e1a63f04a72 +size 3554267640 diff --git a/checkpoint-150/latest b/checkpoint-150/latest new file mode 100644 index 0000000..daf5be2 --- /dev/null +++ b/checkpoint-150/latest @@ -0,0 +1 @@ +global_step150 \ No newline at end of file diff --git a/checkpoint-150/model.safetensors b/checkpoint-150/model.safetensors new file mode 100644 index 0000000..204e87c --- /dev/null +++ b/checkpoint-150/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39aa333d4909dd2e57b05571e5c8a954a5813ad4f503a72afa6c10497b5f51e8 +size 3554214752 diff --git a/checkpoint-150/rng_state_0.pth b/checkpoint-150/rng_state_0.pth new file mode 100644 index 0000000..408a873 --- /dev/null +++ b/checkpoint-150/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96d867261441e1e1663c114f0b6b75f90d9ae6dcdb00127b0edf349cc603361b +size 14960 diff --git a/checkpoint-150/rng_state_1.pth b/checkpoint-150/rng_state_1.pth new file mode 100644 index 0000000..c6550cc --- /dev/null +++ b/checkpoint-150/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:626697b4919a36bf118090898dd7854d355fe644652e7e94c1d8164c54600ce7 +size 14960 diff --git a/checkpoint-150/rng_state_2.pth b/checkpoint-150/rng_state_2.pth new file mode 100644 index 0000000..3926a7e --- /dev/null +++ b/checkpoint-150/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b78f639ba401ba28686266f62024e2c2c981bcc01eed221e9ab333057c17938e +size 14960 diff --git a/checkpoint-150/rng_state_3.pth b/checkpoint-150/rng_state_3.pth new file mode 100644 index 0000000..0a74a91 --- /dev/null +++ b/checkpoint-150/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4875d2494078536392718e5bfe242a49f416663b092d0de6bae0f1b7dccaf452 +size 14960 diff --git a/checkpoint-150/scheduler.pt b/checkpoint-150/scheduler.pt new file mode 100644 
index 0000000..dd6d7b3 --- /dev/null +++ b/checkpoint-150/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:652df553f3c9234b0e74dc48466c4ec0ca48a5ae2c6acd4d2c81ea9542491c84 +size 1064 diff --git a/checkpoint-150/special_tokens_map.json b/checkpoint-150/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/checkpoint-150/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-150/tokenizer.json b/checkpoint-150/tokenizer.json new file mode 100644 index 0000000..e7cd2c1 --- /dev/null +++ b/checkpoint-150/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a +size 11422959 diff --git a/checkpoint-150/tokenizer_config.json b/checkpoint-150/tokenizer_config.json new file mode 100644 index 0000000..ef6e98c --- /dev/null +++ b/checkpoint-150/tokenizer_config.json @@ -0,0 +1,195 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": 
"<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + 
'<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/checkpoint-150/trainer_state.json b/checkpoint-150/trainer_state.json new file mode 100644 index 0000000..2fe6193 --- /dev/null +++ b/checkpoint-150/trainer_state.json @@ -0,0 +1,4084 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.17142857142857143, + "eval_steps": 500, + "global_step": 150, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + 
"completions/max_length": 2048.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 1702.03125, + "completions/mean_terminated_length": 993.6190795898438, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.001142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2544386684894562, + "learning_rate": 0.0, + "loss": -0.0, + "num_tokens": 118418.0, + "reward": 0.17899775505065918, + "reward_std": 0.7650213241577148, + "rewards/cosine_scaled_reward/mean": -0.09800112992525101, + "rewards/cosine_scaled_reward/std": 0.37953105568885803, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.48795005679130554, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1894.0, + "completions/mean_length": 1738.90625, + "completions/mean_terminated_length": 949.0, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.002285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2436082512140274, + "learning_rate": 5e-08, + "loss": -0.0, + "num_tokens": 239748.0, + "reward": 0.3848632574081421, + "reward_std": 0.9111153483390808, + "rewards/cosine_scaled_reward/mean": 0.020556632429361343, + "rewards/cosine_scaled_reward/std": 0.4492928683757782, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1545.0, + "completions/mean_length": 1989.015625, + "completions/mean_terminated_length": 
1104.25, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "epoch": 0.0034285714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2544717788696289, + "learning_rate": 1e-07, + "loss": -0.0, + "num_tokens": 377517.0, + "reward": -0.3279358148574829, + "reward_std": 0.33216947317123413, + "rewards/cosine_scaled_reward/mean": -0.20303040742874146, + "rewards/cosine_scaled_reward/std": 0.179075226187706, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.27048972249031067, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1566.421875, + "completions/mean_terminated_length": 1084.84375, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "epoch": 0.004571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28807103633880615, + "learning_rate": 1.5e-07, + "loss": -0.0, + "num_tokens": 487576.0, + "reward": 0.2716121971607208, + "reward_std": 0.6643469333648682, + "rewards/cosine_scaled_reward/mean": -0.12981891632080078, + "rewards/cosine_scaled_reward/std": 0.3019586503505707, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.5029674172401428, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1807.0, + "completions/mean_length": 1936.84375, + "completions/mean_terminated_length": 1031.71435546875, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "epoch": 0.005714285714285714, + "frac_reward_zero_std": 0.0, 
+ "grad_norm": 0.26783761382102966, + "learning_rate": 2e-07, + "loss": -0.0, + "num_tokens": 622350.0, + "reward": -0.3612896800041199, + "reward_std": 0.41048353910446167, + "rewards/cosine_scaled_reward/mean": -0.23533234000205994, + "rewards/cosine_scaled_reward/std": 0.20467400550842285, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.3145764470100403, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1301.0, + "completions/mean_length": 1889.453125, + "completions/mean_terminated_length": 779.625, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "epoch": 0.006857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.262518972158432, + "learning_rate": 2.5e-07, + "loss": 0.0, + "num_tokens": 754923.0, + "reward": -0.29250282049179077, + "reward_std": 0.5422531962394714, + "rewards/cosine_scaled_reward/mean": -0.22437641024589539, + "rewards/cosine_scaled_reward/std": 0.22509199380874634, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36596253514289856, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1568.0, + "completions/mean_length": 1921.921875, + "completions/mean_terminated_length": 1314.45458984375, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "epoch": 0.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22601397335529327, + "learning_rate": 3e-07, + "loss": 0.0, + "num_tokens": 888334.0, + "reward": 0.025340259075164795, + "reward_std": 
0.7285393476486206, + "rewards/cosine_scaled_reward/mean": -0.1279548704624176, + "rewards/cosine_scaled_reward/std": 0.40222346782684326, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1736.859375, + "completions/mean_terminated_length": 999.9473876953125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.009142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24552854895591736, + "learning_rate": 3.5e-07, + "loss": 0.0, + "num_tokens": 1009909.0, + "reward": 0.21729671955108643, + "reward_std": 0.6989120244979858, + "rewards/cosine_scaled_reward/mean": -0.055414143949747086, + "rewards/cosine_scaled_reward/std": 0.47493892908096313, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.4732423722743988, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 1967.53125, + "completions/mean_terminated_length": 1475.77783203125, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "epoch": 0.010285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2430322915315628, + "learning_rate": 4e-07, + "loss": 0.0, + "num_tokens": 1147287.0, + "reward": -0.21451422572135925, + "reward_std": 0.587526798248291, + "rewards/cosine_scaled_reward/mean": -0.19319462776184082, + "rewards/cosine_scaled_reward/std": 0.29357606172561646, + 
"rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.38025420904159546, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1708.546875, + "completions/mean_terminated_length": 961.75, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.011428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2543582320213318, + "learning_rate": 4.5e-07, + "loss": 0.0, + "num_tokens": 1267466.0, + "reward": 0.02539752423763275, + "reward_std": 0.545810341835022, + "rewards/cosine_scaled_reward/mean": -0.14355123043060303, + "rewards/cosine_scaled_reward/std": 0.36147356033325195, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.467176616191864, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1579.0, + "completions/mean_length": 1967.734375, + "completions/mean_terminated_length": 1191.8333740234375, + "completions/min_length": 843.0, + "completions/min_terminated_length": 843.0, + "epoch": 0.012571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24583907425403595, + "learning_rate": 5e-07, + "loss": -0.0, + "num_tokens": 1405073.0, + "reward": -0.46971434354782104, + "reward_std": 0.36104393005371094, + "rewards/cosine_scaled_reward/mean": -0.28173214197158813, + "rewards/cosine_scaled_reward/std": 0.17775526642799377, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29378482699394226, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 1707.5625, + "completions/mean_terminated_length": 1176.47998046875, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.013714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3135142922401428, + "learning_rate": 5.5e-07, + "loss": -0.0, + "num_tokens": 1525301.0, + "reward": 0.0018395520746707916, + "reward_std": 0.7012988328933716, + "rewards/cosine_scaled_reward/mean": -0.21783021092414856, + "rewards/cosine_scaled_reward/std": 0.324150949716568, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.5, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1745.0, + "completions/mean_length": 1841.96875, + "completions/mean_terminated_length": 1168.933349609375, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.014857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2532394826412201, + "learning_rate": 6e-07, + "loss": -0.0, + "num_tokens": 1654227.0, + "reward": -0.10322706401348114, + "reward_std": 0.6915165185928345, + "rewards/cosine_scaled_reward/mean": -0.17661353945732117, + "rewards/cosine_scaled_reward/std": 0.329875111579895, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + 
"completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1816.390625, + "completions/mean_terminated_length": 1306.8499755859375, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "epoch": 0.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28405147790908813, + "learning_rate": 6.5e-07, + "loss": 0.0, + "num_tokens": 1781084.0, + "reward": 0.10602855682373047, + "reward_std": 0.630502462387085, + "rewards/cosine_scaled_reward/mean": -0.11104822158813477, + "rewards/cosine_scaled_reward/std": 0.3846627473831177, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.4732423722743988, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1574.0, + "completions/mean_length": 1702.109375, + "completions/mean_terminated_length": 818.1666870117188, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.017142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28779250383377075, + "learning_rate": 7e-07, + "loss": 0.0, + "num_tokens": 1900939.0, + "reward": 0.32734519243240356, + "reward_std": 0.3870265483856201, + "rewards/cosine_scaled_reward/mean": 0.007422588765621185, + "rewards/cosine_scaled_reward/std": 0.45787373185157776, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.467176616191864, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, 
+ "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.018285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2337152510881424, + "learning_rate": 7.5e-07, + "loss": -0.0, + "num_tokens": 2042451.0, + "reward": -0.5429925918579102, + "reward_std": 0.3153150975704193, + "rewards/cosine_scaled_reward/mean": -0.2714962661266327, + "rewards/cosine_scaled_reward/std": 0.1678173691034317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 1564.921875, + "completions/mean_terminated_length": 858.8846435546875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.019428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33599403500556946, + "learning_rate": 8e-07, + "loss": -0.0, + "num_tokens": 2153126.0, + "reward": 0.17696775496006012, + "reward_std": 0.6489306688308716, + "rewards/cosine_scaled_reward/mean": -0.11464111506938934, + "rewards/cosine_scaled_reward/std": 0.3551919758319855, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49501484632492065, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 1795.390625, + "completions/mean_terminated_length": 893.21435546875, + "completions/min_length": 619.0, + "completions/min_terminated_length": 619.0, + "epoch": 0.02057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 
0.22697053849697113, + "learning_rate": 8.499999999999999e-07, + "loss": -0.0, + "num_tokens": 2278407.0, + "reward": -0.10711958259344101, + "reward_std": 0.5238703489303589, + "rewards/cosine_scaled_reward/mean": -0.1785597801208496, + "rewards/cosine_scaled_reward/std": 0.2545098662376404, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 1921.484375, + "completions/mean_terminated_length": 1238.300048828125, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "epoch": 0.021714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23972108960151672, + "learning_rate": 9e-07, + "loss": 0.0, + "num_tokens": 2412638.0, + "reward": 0.029344379901885986, + "reward_std": 0.6719281077384949, + "rewards/cosine_scaled_reward/mean": -0.086890310049057, + "rewards/cosine_scaled_reward/std": 0.40220555663108826, + "rewards/format_reward/mean": 0.203125, + "rewards/format_reward/std": 0.40550529956817627, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1728.5625, + "completions/mean_terminated_length": 845.4117431640625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.022857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23309311270713806, + "learning_rate": 9.499999999999999e-07, + "loss": 0.0, + "num_tokens": 2534618.0, + "reward": 
0.0131673663854599, + "reward_std": 0.4436222314834595, + "rewards/cosine_scaled_reward/mean": -0.13404130935668945, + "rewards/cosine_scaled_reward/std": 0.32819250226020813, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 1777.953125, + "completions/mean_terminated_length": 1087.8333740234375, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29990270733833313, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 2659215.0, + "reward": -0.1764472872018814, + "reward_std": 0.5121938586235046, + "rewards/cosine_scaled_reward/mean": -0.2444736361503601, + "rewards/cosine_scaled_reward/std": 0.289971262216568, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.467176616191864, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1361.28125, + "completions/mean_terminated_length": 921.0769653320312, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.025142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29922786355018616, + "learning_rate": 9.99931462820376e-07, + "loss": -0.0, + "num_tokens": 2755353.0, + "reward": 0.6089149713516235, + "reward_std": 0.5986809730529785, + "rewards/cosine_scaled_reward/mean": -0.05491749942302704, + 
"rewards/cosine_scaled_reward/std": 0.39076483249664307, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1565.046875, + "completions/mean_terminated_length": 903.2222290039062, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.026285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27512773871421814, + "learning_rate": 9.997258721585931e-07, + "loss": -0.0, + "num_tokens": 2866308.0, + "reward": 0.21871733665466309, + "reward_std": 0.5976030826568604, + "rewards/cosine_scaled_reward/mean": -0.10157884657382965, + "rewards/cosine_scaled_reward/std": 0.3856185972690582, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.49776285886764526, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 1801.671875, + "completions/mean_terminated_length": 1259.75, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "epoch": 0.027428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22642865777015686, + "learning_rate": 9.993832906395582e-07, + "loss": -0.0, + "num_tokens": 2992543.0, + "reward": 0.04899948835372925, + "reward_std": 0.8525694608688354, + "rewards/cosine_scaled_reward/mean": -0.17081275582313538, + "rewards/cosine_scaled_reward/std": 0.3993513882160187, + "rewards/format_reward/mean": 0.390625, + 
"rewards/format_reward/std": 0.4917473793029785, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1715.765625, + "completions/mean_terminated_length": 1035.4761962890625, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.02857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25316134095191956, + "learning_rate": 9.989038226169207e-07, + "loss": -0.0, + "num_tokens": 3112648.0, + "reward": 0.10585837811231613, + "reward_std": 0.7828943729400635, + "rewards/cosine_scaled_reward/mean": -0.11894579976797104, + "rewards/cosine_scaled_reward/std": 0.4141720235347748, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 1917.703125, + "completions/mean_terminated_length": 1452.357177734375, + "completions/min_length": 840.0, + "completions/min_terminated_length": 840.0, + "epoch": 0.029714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2521306574344635, + "learning_rate": 9.982876141412855e-07, + "loss": -0.0, + "num_tokens": 3246013.0, + "reward": 0.17620250582695007, + "reward_std": 0.6548349857330322, + "rewards/cosine_scaled_reward/mean": -0.08377375453710556, + "rewards/cosine_scaled_reward/std": 0.3527655303478241, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1851.015625, + "completions/mean_terminated_length": 1147.5, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "epoch": 0.030857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2730060815811157, + "learning_rate": 9.975348529157229e-07, + "loss": -0.0, + "num_tokens": 3374766.0, + "reward": -0.18854813277721405, + "reward_std": 0.49348777532577515, + "rewards/cosine_scaled_reward/mean": -0.21146157383918762, + "rewards/cosine_scaled_reward/std": 0.2601618766784668, + "rewards/format_reward/mean": 0.234375, + "rewards/format_reward/std": 0.42695629596710205, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1798.328125, + "completions/mean_terminated_length": 1049.3125, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2566036880016327, + "learning_rate": 9.96645768238595e-07, + "loss": 0.0, + "num_tokens": 3500195.0, + "reward": 0.06705980002880096, + "reward_std": 0.7090284824371338, + "rewards/cosine_scaled_reward/mean": -0.10709509253501892, + "rewards/cosine_scaled_reward/std": 0.4101051986217499, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1930.203125, + "completions/mean_terminated_length": 1210.3333740234375, + "completions/min_length": 582.0, + "completions/min_terminated_length": 582.0, + "epoch": 0.03314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25197461247444153, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0, + "num_tokens": 3634200.0, + "reward": -0.2462695688009262, + "reward_std": 0.5237302780151367, + "rewards/cosine_scaled_reward/mean": -0.2012597918510437, + "rewards/cosine_scaled_reward/std": 0.23252712190151215, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36596253514289856, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1900.0, + "completions/mean_length": 1847.65625, + "completions/mean_terminated_length": 1061.6923828125, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "epoch": 0.03428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30431485176086426, + "learning_rate": 9.944597532678119e-07, + "loss": 0.0, + "num_tokens": 3762986.0, + "reward": -0.05392302945256233, + "reward_std": 0.7249555587768555, + "rewards/cosine_scaled_reward/mean": -0.15196150541305542, + "rewards/cosine_scaled_reward/std": 0.34566983580589294, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 
1860.0, + "completions/mean_length": 1838.671875, + "completions/mean_terminated_length": 931.5833740234375, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.03542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2484513372182846, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0, + "num_tokens": 3891157.0, + "reward": -0.11271396279335022, + "reward_std": 0.6705260872840881, + "rewards/cosine_scaled_reward/mean": -0.1813569962978363, + "rewards/cosine_scaled_reward/std": 0.4071698486804962, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1715.0, + "completions/mean_length": 1910.109375, + "completions/mean_terminated_length": 1417.6429443359375, + "completions/min_length": 906.0, + "completions/min_terminated_length": 906.0, + "epoch": 0.036571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25329527258872986, + "learning_rate": 9.917322325514487e-07, + "loss": -0.0, + "num_tokens": 4023756.0, + "reward": -0.08931556344032288, + "reward_std": 0.6381070613861084, + "rewards/cosine_scaled_reward/mean": -0.16965776681900024, + "rewards/cosine_scaled_reward/std": 0.37385129928588867, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1865.0, + "completions/mean_length": 2023.71875, + "completions/mean_terminated_length": 1530.0, + 
"completions/min_length": 1107.0, + "completions/min_terminated_length": 1107.0, + "epoch": 0.037714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22758109867572784, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0, + "num_tokens": 4164490.0, + "reward": -0.4589868187904358, + "reward_std": 0.5177067518234253, + "rewards/cosine_scaled_reward/mean": -0.2919934093952179, + "rewards/cosine_scaled_reward/std": 0.2252870500087738, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3333333432674408, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1454.78125, + "completions/mean_terminated_length": 963.2571411132812, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.038857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3234354257583618, + "learning_rate": 9.88466529153356e-07, + "loss": 0.0, + "num_tokens": 4267148.0, + "reward": 0.656031608581543, + "reward_std": 0.7529654502868652, + "rewards/cosine_scaled_reward/mean": 0.05457830801606178, + "rewards/cosine_scaled_reward/std": 0.49684229493141174, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.501733124256134, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1724.0, + "completions/mean_length": 1819.078125, + "completions/mean_terminated_length": 716.0909423828125, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "epoch": 0.04, + 
"frac_reward_zero_std": 0.0, + "grad_norm": 0.2821458876132965, + "learning_rate": 9.866330768241983e-07, + "loss": -0.0, + "num_tokens": 4395065.0, + "reward": -0.09630556404590607, + "reward_std": 0.7089139223098755, + "rewards/cosine_scaled_reward/mean": -0.15752778947353363, + "rewards/cosine_scaled_reward/std": 0.3647947609424591, + "rewards/format_reward/mean": 0.21875, + "rewards/format_reward/std": 0.4166666865348816, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1811.0, + "completions/mean_length": 1954.34375, + "completions/mean_terminated_length": 1382.0, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "epoch": 0.04114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24163897335529327, + "learning_rate": 9.846666218300807e-07, + "loss": -0.0, + "num_tokens": 4531255.0, + "reward": -0.34593287110328674, + "reward_std": 0.44493502378463745, + "rewards/cosine_scaled_reward/mean": -0.24327893555164337, + "rewards/cosine_scaled_reward/std": 0.24784433841705322, + "rewards/format_reward/mean": 0.140625, + "rewards/format_reward/std": 0.3503824472427368, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1723.0, + "completions/mean_length": 1868.921875, + "completions/mean_terminated_length": 1092.916748046875, + "completions/min_length": 620.0, + "completions/min_terminated_length": 620.0, + "epoch": 0.04228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24795544147491455, + "learning_rate": 9.825677631722435e-07, + "loss": 
-0.0, + "num_tokens": 4661890.0, + "reward": -0.23053905367851257, + "reward_std": 0.34036368131637573, + "rewards/cosine_scaled_reward/mean": -0.2246445268392563, + "rewards/cosine_scaled_reward/std": 0.15942412614822388, + "rewards/format_reward/mean": 0.21875, + "rewards/format_reward/std": 0.4166666865348816, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 1889.53125, + "completions/mean_terminated_length": 1033.800048828125, + "completions/min_length": 810.0, + "completions/min_terminated_length": 810.0, + "epoch": 0.04342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24283826351165771, + "learning_rate": 9.80337140183366e-07, + "loss": 0.0, + "num_tokens": 4794532.0, + "reward": -0.10043507814407349, + "reward_std": 0.47925832867622375, + "rewards/cosine_scaled_reward/mean": -0.13615503907203674, + "rewards/cosine_scaled_reward/std": 0.3336707651615143, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.38025420904159546, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1515.0, + "completions/mean_length": 1644.828125, + "completions/mean_terminated_length": 689.9473876953125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.044571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28362998366355896, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0, + "num_tokens": 4910585.0, + "reward": 0.12284853309392929, + "reward_std": 0.4183085858821869, + 
"rewards/cosine_scaled_reward/mean": -0.11045074462890625, + "rewards/cosine_scaled_reward/std": 0.30217844247817993, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1632.0, + "completions/mean_length": 1618.28125, + "completions/mean_terminated_length": 902.0833740234375, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.045714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.262617826461792, + "learning_rate": 9.754833590196926e-07, + "loss": 0.0, + "num_tokens": 5024227.0, + "reward": 0.2076582908630371, + "reward_std": 0.42125773429870605, + "rewards/cosine_scaled_reward/mean": -0.12273336946964264, + "rewards/cosine_scaled_reward/std": 0.4404613971710205, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1914.0, + "completions/mean_length": 1717.734375, + "completions/mean_terminated_length": 1235.0384521484375, + "completions/min_length": 664.0, + "completions/min_terminated_length": 664.0, + "epoch": 0.046857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23294499516487122, + "learning_rate": 9.728616793536587e-07, + "loss": -0.0, + "num_tokens": 5145314.0, + "reward": 0.011502981185913086, + "reward_std": 0.6816084980964661, + "rewards/cosine_scaled_reward/mean": -0.22081100940704346, + "rewards/cosine_scaled_reward/std": 
0.37589573860168457, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1672.0, + "completions/mean_length": 1703.921875, + "completions/mean_terminated_length": 579.933349609375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34672290086746216, + "learning_rate": 9.701111919237408e-07, + "loss": -0.0, + "num_tokens": 5264725.0, + "reward": -0.2616002857685089, + "reward_std": 0.37952175736427307, + "rewards/cosine_scaled_reward/mean": -0.26361262798309326, + "rewards/cosine_scaled_reward/std": 0.17531204223632812, + "rewards/format_reward/mean": 0.265625, + "rewards/format_reward/std": 0.44515693187713623, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1370.0, + "completions/mean_length": 1681.84375, + "completions/mean_terminated_length": 814.631591796875, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.04914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.263967901468277, + "learning_rate": 9.672327345550543e-07, + "loss": -0.0, + "num_tokens": 5383979.0, + "reward": 0.13376155495643616, + "reward_std": 0.46012288331985474, + "rewards/cosine_scaled_reward/mean": -0.08155670762062073, + "rewards/cosine_scaled_reward/std": 0.3612325191497803, + "rewards/format_reward/mean": 0.296875, + "rewards/format_reward/std": 0.4604927599430084, + "step": 
43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 1624.625, + "completions/mean_terminated_length": 869.9130859375, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.05028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28927963972091675, + "learning_rate": 9.64227184053598e-07, + "loss": -0.0, + "num_tokens": 5498651.0, + "reward": 0.20869271457195282, + "reward_std": 0.5558150410652161, + "rewards/cosine_scaled_reward/mean": -0.0987786278128624, + "rewards/cosine_scaled_reward/std": 0.42912590503692627, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49501484632492065, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1851.0, + "completions/mean_length": 2006.96875, + "completions/mean_terminated_length": 1522.800048828125, + "completions/min_length": 955.0, + "completions/min_terminated_length": 955.0, + "epoch": 0.05142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24254000186920166, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0, + "num_tokens": 5638753.0, + "reward": -0.2540697157382965, + "reward_std": 0.4600578844547272, + "rewards/cosine_scaled_reward/mean": -0.20515984296798706, + "rewards/cosine_scaled_reward/std": 0.3251590430736542, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36596253514289856, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1563.0, + "completions/mean_length": 1765.984375, + "completions/mean_terminated_length": 919.9375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "epoch": 0.052571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2645930349826813, + "learning_rate": 9.578385041664925e-07, + "loss": 0.0, + "num_tokens": 5762944.0, + "reward": -0.213707834482193, + "reward_std": 0.38778313994407654, + "rewards/cosine_scaled_reward/mean": -0.2318539321422577, + "rewards/cosine_scaled_reward/std": 0.21436986327171326, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1583.40625, + "completions/mean_terminated_length": 986.0714721679688, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.053714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.311797559261322, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0, + "num_tokens": 5874682.0, + "reward": 0.27925533056259155, + "reward_std": 0.6467443704605103, + "rewards/cosine_scaled_reward/mean": -0.07912233471870422, + "rewards/cosine_scaled_reward/std": 0.4737093150615692, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.5, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1527.0, + "completions/mean_length": 1690.0625, + "completions/mean_terminated_length": 1006.727294921875, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.054857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26644304394721985, + "learning_rate": 9.509529358847654e-07, + "loss": -0.0, + "num_tokens": 5993390.0, + "reward": 0.13692031800746918, + "reward_std": 0.5655145049095154, + "rewards/cosine_scaled_reward/mean": -0.12685233354568481, + "rewards/cosine_scaled_reward/std": 0.32320985198020935, + "rewards/format_reward/mean": 0.390625, + "rewards/format_reward/std": 0.4917473793029785, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1387.140625, + "completions/mean_terminated_length": 804.0294189453125, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3078882396221161, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0, + "num_tokens": 6092231.0, + "reward": 0.35559189319610596, + "reward_std": 0.5927403569221497, + "rewards/cosine_scaled_reward/mean": -0.09564155340194702, + "rewards/cosine_scaled_reward/std": 0.4046906530857086, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.501733124256134, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 1674.890625, + "completions/mean_terminated_length": 
962.5909423828125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.05714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23925544321537018, + "learning_rate": 9.43578868212728e-07, + "loss": -0.0, + "num_tokens": 6210240.0, + "reward": 0.18573230504989624, + "reward_std": 0.5264967083930969, + "rewards/cosine_scaled_reward/mean": -0.09463384002447128, + "rewards/cosine_scaled_reward/std": 0.4100942015647888, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.48795005679130554, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1347.40625, + "completions/mean_terminated_length": 836.1621704101562, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.05828571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.325811505317688, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0, + "num_tokens": 6306682.0, + "reward": 0.1735648661851883, + "reward_std": 0.5335988998413086, + "rewards/cosine_scaled_reward/mean": -0.21009255945682526, + "rewards/cosine_scaled_reward/std": 0.2623959481716156, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49501484632492065, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1390.0, + "completions/mean_length": 1727.765625, + "completions/mean_terminated_length": 767.0625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 
0.05942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27392977476119995, + "learning_rate": 9.357252853159505e-07, + "loss": 0.0, + "num_tokens": 6428611.0, + "reward": -0.16267812252044678, + "reward_std": 0.5682471990585327, + "rewards/cosine_scaled_reward/mean": -0.2219640612602234, + "rewards/cosine_scaled_reward/std": 0.36739134788513184, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1907.0, + "completions/mean_length": 1609.171875, + "completions/mean_terminated_length": 924.5999755859375, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.060571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28155064582824707, + "learning_rate": 9.316216432703916e-07, + "loss": -0.0, + "num_tokens": 6542430.0, + "reward": 0.0752667784690857, + "reward_std": 0.7118167281150818, + "rewards/cosine_scaled_reward/mean": -0.18892911076545715, + "rewards/cosine_scaled_reward/std": 0.3222156763076782, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1588.234375, + "completions/mean_terminated_length": 1067.166748046875, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "epoch": 0.061714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2555343806743622, + "learning_rate": 
9.274017555754407e-07, + "loss": 0.0, + "num_tokens": 6655221.0, + "reward": 0.6341299414634705, + "reward_std": 1.0656921863555908, + "rewards/cosine_scaled_reward/mean": 0.05143994837999344, + "rewards/cosine_scaled_reward/std": 0.5348308086395264, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.5029674172401428, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1420.0, + "completions/mean_length": 1549.5625, + "completions/mean_terminated_length": 821.0769653320312, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "epoch": 0.06285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30243629217147827, + "learning_rate": 9.230669076497687e-07, + "loss": -0.0, + "num_tokens": 6764681.0, + "reward": 0.13021975755691528, + "reward_std": 0.3984764516353607, + "rewards/cosine_scaled_reward/mean": -0.13801513612270355, + "rewards/cosine_scaled_reward/std": 0.41228073835372925, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49501484632492065, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1633.25, + "completions/mean_terminated_length": 1132.689697265625, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "epoch": 0.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23835402727127075, + "learning_rate": 9.186184199300463e-07, + "loss": -0.0, + "num_tokens": 6880169.0, + "reward": 0.27981996536254883, + "reward_std": 
0.5018116235733032, + "rewards/cosine_scaled_reward/mean": -0.10227750986814499, + "rewards/cosine_scaled_reward/std": 0.481824666261673, + "rewards/format_reward/mean": 0.484375, + "rewards/format_reward/std": 0.5037065148353577, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1807.0, + "completions/mean_length": 1699.875, + "completions/mean_terminated_length": 1156.7999267578125, + "completions/min_length": 642.0, + "completions/min_terminated_length": 642.0, + "epoch": 0.06514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22349494695663452, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0, + "num_tokens": 7000529.0, + "reward": -0.026505012065172195, + "reward_std": 0.5785415172576904, + "rewards/cosine_scaled_reward/mean": -0.20856501162052155, + "rewards/cosine_scaled_reward/std": 0.2749907374382019, + "rewards/format_reward/mean": 0.390625, + "rewards/format_reward/std": 0.4917473793029785, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1457.875, + "completions/mean_terminated_length": 1054.105224609375, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "epoch": 0.06628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.261942595243454, + "learning_rate": 9.093859795212817e-07, + "loss": 0.0, + "num_tokens": 7103929.0, + "reward": 0.5745843648910522, + "reward_std": 0.8671218156814575, + "rewards/cosine_scaled_reward/mean": -0.03302033245563507, + 
"rewards/cosine_scaled_reward/std": 0.45529407262802124, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4836103618144989, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1590.0625, + "completions/mean_terminated_length": 1159.8787841796875, + "completions/min_length": 591.0, + "completions/min_terminated_length": 591.0, + "epoch": 0.06742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24828943610191345, + "learning_rate": 9.046048391230247e-07, + "loss": -0.0, + "num_tokens": 7216157.0, + "reward": 0.3377103805541992, + "reward_std": 0.5543617010116577, + "rewards/cosine_scaled_reward/mean": -0.1045822948217392, + "rewards/cosine_scaled_reward/std": 0.39040952920913696, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.501733124256134, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 1622.84375, + "completions/mean_terminated_length": 1076.21435546875, + "completions/min_length": 555.0, + "completions/min_terminated_length": 555.0, + "epoch": 0.06857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2752656936645508, + "learning_rate": 8.997156826556369e-07, + "loss": -0.0, + "num_tokens": 7330907.0, + "reward": 0.11114693433046341, + "reward_std": 0.6926254034042358, + "rewards/cosine_scaled_reward/mean": -0.1788015365600586, + "rewards/cosine_scaled_reward/std": 0.39409172534942627, + "rewards/format_reward/mean": 0.46875, + 
"rewards/format_reward/std": 0.5029674172401428, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1708.859375, + "completions/mean_terminated_length": 1014.4285888671875, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.06971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22669929265975952, + "learning_rate": 8.9471999940354e-07, + "loss": -0.0, + "num_tokens": 7451794.0, + "reward": 0.2345120906829834, + "reward_std": 0.6293160319328308, + "rewards/cosine_scaled_reward/mean": -0.1093064472079277, + "rewards/cosine_scaled_reward/std": 0.29189831018447876, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1281.53125, + "completions/mean_terminated_length": 1004.2978515625, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.07085714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25438693165779114, + "learning_rate": 8.896193111002475e-07, + "loss": 0.0, + "num_tokens": 7544044.0, + "reward": 0.9180847406387329, + "reward_std": 0.6390912532806396, + "rewards/cosine_scaled_reward/mean": 0.06841734796762466, + "rewards/cosine_scaled_reward/std": 0.48315128684043884, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4166666865348816, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 1310.46875, + "completions/mean_terminated_length": 896.731689453125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28795576095581055, + "learning_rate": 8.844151714648274e-07, + "loss": -0.0, + "num_tokens": 7638170.0, + "reward": 0.6488770246505737, + "reward_std": 0.7876260876655579, + "rewards/cosine_scaled_reward/mean": -0.019311510026454926, + "rewards/cosine_scaled_reward/std": 0.4736698865890503, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.467176616191864, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 1307.625, + "completions/mean_terminated_length": 1039.8297119140625, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.07314285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25637197494506836, + "learning_rate": 8.791091657286267e-07, + "loss": -0.0, + "num_tokens": 7732810.0, + "reward": 0.8280279636383057, + "reward_std": 0.6804471015930176, + "rewards/cosine_scaled_reward/mean": 0.015576483681797981, + "rewards/cosine_scaled_reward/std": 0.44819310307502747, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.40550529956817627, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1846.0, + "completions/mean_length": 1322.125, + "completions/mean_terminated_length": 914.9268188476562, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.07428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2944399118423462, + "learning_rate": 8.737029101523929e-07, + "loss": -0.0, + "num_tokens": 7828130.0, + "reward": 0.15610456466674805, + "reward_std": 0.4606686234474182, + "rewards/cosine_scaled_reward/mean": -0.24226020276546478, + "rewards/cosine_scaled_reward/std": 0.33131492137908936, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4836103618144989, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 1020.21875, + "completions/mean_terminated_length": 806.9057006835938, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.07542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32644009590148926, + "learning_rate": 8.681980515339463e-07, + "loss": 0.0, + "num_tokens": 7903656.0, + "reward": 0.7972471714019775, + "reward_std": 0.7674820423126221, + "rewards/cosine_scaled_reward/mean": -0.031063925474882126, + "rewards/cosine_scaled_reward/std": 0.5106223225593567, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1910.0, + "completions/mean_length": 1750.859375, + "completions/mean_terminated_length": 1142.4285888671875, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "epoch": 0.07657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2270829975605011, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0, + "num_tokens": 8026447.0, + "reward": -0.1400720775127411, + "reward_std": 0.3325888514518738, + "rewards/cosine_scaled_reward/mean": -0.24972353875637054, + "rewards/cosine_scaled_reward/std": 0.16404789686203003, + "rewards/format_reward/mean": 0.359375, + "rewards/format_reward/std": 0.4836103618144989, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1424.0, + "completions/mean_length": 769.546875, + "completions/mean_terminated_length": 637.2930908203125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.07771428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37025144696235657, + "learning_rate": 8.568992620281243e-07, + "loss": -0.0, + "num_tokens": 8084954.0, + "reward": 0.9792699813842773, + "reward_std": 0.804767370223999, + "rewards/cosine_scaled_reward/mean": 0.03651002421975136, + "rewards/cosine_scaled_reward/std": 0.46041443943977356, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1701.0, + "completions/mean_length": 1086.234375, + 
"completions/mean_terminated_length": 886.6226806640625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.07885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3763800263404846, + "learning_rate": 8.511087728614862e-07, + "loss": 0.0, + "num_tokens": 8164817.0, + "reward": 0.35803771018981934, + "reward_std": 0.5702667236328125, + "rewards/cosine_scaled_reward/mean": -0.24285613000392914, + "rewards/cosine_scaled_reward/std": 0.3019825220108032, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1898.0, + "completions/mean_length": 1463.375, + "completions/mean_terminated_length": 1112.5999755859375, + "completions/min_length": 503.0, + "completions/min_terminated_length": 503.0, + "epoch": 0.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24232418835163116, + "learning_rate": 8.452265630457282e-07, + "loss": -0.0, + "num_tokens": 8269929.0, + "reward": 0.3703588843345642, + "reward_std": 0.7288752794265747, + "rewards/cosine_scaled_reward/mean": -0.1351330280303955, + "rewards/cosine_scaled_reward/std": 0.3751916289329529, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4836103618144989, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1409.859375, + "completions/mean_terminated_length": 973.2368774414062, + "completions/min_length": 404.0, + "completions/min_terminated_length": 
404.0, + "epoch": 0.08114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.300010621547699, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0, + "num_tokens": 8370880.0, + "reward": 0.5196826457977295, + "reward_std": 0.7097917795181274, + "rewards/cosine_scaled_reward/mean": -0.044846177101135254, + "rewards/cosine_scaled_reward/std": 0.508389949798584, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4917473793029785, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 1228.046875, + "completions/mean_terminated_length": 931.4680786132812, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.08228571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30454304814338684, + "learning_rate": 8.331941759724268e-07, + "loss": -0.0, + "num_tokens": 8459827.0, + "reward": 0.41365131735801697, + "reward_std": 0.5005639791488647, + "rewards/cosine_scaled_reward/mean": -0.1759868562221527, + "rewards/cosine_scaled_reward/std": 0.19868774712085724, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1719.0, + "completions/mean_length": 1513.28125, + "completions/mean_terminated_length": 1192.4500732421875, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "epoch": 0.08342857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27848970890045166, + 
"learning_rate": 8.270476638965461e-07, + "loss": -0.0, + "num_tokens": 8567405.0, + "reward": 0.09570223093032837, + "reward_std": 0.5445049405097961, + "rewards/cosine_scaled_reward/mean": -0.2802739143371582, + "rewards/cosine_scaled_reward/std": 0.25603488087654114, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4787135720252991, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1888.0, + "completions/mean_length": 1240.125, + "completions/mean_terminated_length": 924.0, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "epoch": 0.08457142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2789021134376526, + "learning_rate": 8.208167604184217e-07, + "loss": 0.0, + "num_tokens": 8656701.0, + "reward": 0.7823752760887146, + "reward_std": 0.6479132175445557, + "rewards/cosine_scaled_reward/mean": 0.031812600791454315, + "rewards/cosine_scaled_reward/std": 0.5397623181343079, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1455.953125, + "completions/mean_terminated_length": 1186.8409423828125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "epoch": 0.08571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22443196177482605, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0, + "num_tokens": 8760842.0, + "reward": 0.8040015697479248, + "reward_std": 
0.5675323009490967, + "rewards/cosine_scaled_reward/mean": 0.027000809088349342, + "rewards/cosine_scaled_reward/std": 0.5096040964126587, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4364357888698578, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1720.0, + "completions/mean_length": 1177.859375, + "completions/mean_terminated_length": 863.1276245117188, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.08685714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32647648453712463, + "learning_rate": 8.081093963579707e-07, + "loss": 0.0, + "num_tokens": 8846625.0, + "reward": 0.310506671667099, + "reward_std": 0.5110941529273987, + "rewards/cosine_scaled_reward/mean": -0.2119341641664505, + "rewards/cosine_scaled_reward/std": 0.24737994372844696, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44515693187713623, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1787.0, + "completions/mean_length": 1263.4375, + "completions/mean_terminated_length": 1043.760009765625, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2545543611049652, + "learning_rate": 8.01636806561836e-07, + "loss": -0.0, + "num_tokens": 8939061.0, + "reward": 0.5484907031059265, + "reward_std": 0.48998576402664185, + "rewards/cosine_scaled_reward/mean": -0.13200464844703674, + "rewards/cosine_scaled_reward/std": 
0.3430649936199188, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39339789748191833, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1460.78125, + "completions/mean_terminated_length": 1059.0, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.08914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2583931088447571, + "learning_rate": 7.950875657567621e-07, + "loss": 0.0, + "num_tokens": 9043271.0, + "reward": 0.6075442433357239, + "reward_std": 0.6895643472671509, + "rewards/cosine_scaled_reward/mean": -0.0009153857827186584, + "rewards/cosine_scaled_reward/std": 0.48922818899154663, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4917473793029785, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1956.0, + "completions/mean_length": 1054.875, + "completions/mean_terminated_length": 892.3635864257812, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.09028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29089078307151794, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0, + "num_tokens": 9120879.0, + "reward": 0.6885831356048584, + "reward_std": 0.508629322052002, + "rewards/cosine_scaled_reward/mean": -0.09320840239524841, + "rewards/cosine_scaled_reward/std": 0.38835227489471436, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 79 + 
}, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1399.046875, + "completions/mean_terminated_length": 1145.1087646484375, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "epoch": 0.09142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27458345890045166, + "learning_rate": 7.817671337095244e-07, + "loss": 0.0, + "num_tokens": 9220810.0, + "reward": 0.5549384355545044, + "reward_std": 0.7092134952545166, + "rewards/cosine_scaled_reward/mean": -0.09753081202507019, + "rewards/cosine_scaled_reward/std": 0.4125780463218689, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4364357888698578, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1833.0, + "completions/mean_length": 1084.984375, + "completions/mean_terminated_length": 906.6481323242188, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.09257142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37247684597969055, + "learning_rate": 7.75e-07, + "loss": -0.0, + "num_tokens": 9301521.0, + "reward": 0.5357480049133301, + "reward_std": 0.5661624670028687, + "rewards/cosine_scaled_reward/mean": -0.18525099754333496, + "rewards/cosine_scaled_reward/std": 0.3385297954082489, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1260.921875, + "completions/mean_terminated_length": 998.5625, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.09371428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27329322695732117, + "learning_rate": 7.681643291108517e-07, + "loss": -0.0, + "num_tokens": 9392548.0, + "reward": 0.9478914737701416, + "reward_std": 0.4313860237598419, + "rewards/cosine_scaled_reward/mean": 0.09894578158855438, + "rewards/cosine_scaled_reward/std": 0.5477120876312256, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4364357888698578, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1309.671875, + "completions/mean_terminated_length": 922.9285888671875, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.09485714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3202998638153076, + "learning_rate": 7.612622032536507e-07, + "loss": -0.0, + "num_tokens": 9487455.0, + "reward": 0.5201998949050903, + "reward_std": 0.6858996152877808, + "rewards/cosine_scaled_reward/mean": -0.09927503764629364, + "rewards/cosine_scaled_reward/std": 0.37909674644470215, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1685.0, + "completions/mean_length": 1185.703125, + "completions/mean_terminated_length": 965.9019775390625, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29646041989326477, + "learning_rate": 7.54295724882796e-07, + "loss": -0.0, + "num_tokens": 9574036.0, + "reward": 0.6779025793075562, + "reward_std": 0.557724118232727, + "rewards/cosine_scaled_reward/mean": -0.09073619544506073, + "rewards/cosine_scaled_reward/std": 0.3855368196964264, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1297.828125, + "completions/mean_terminated_length": 1158.907470703125, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "epoch": 0.09714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21307455003261566, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0, + "num_tokens": 9667417.0, + "reward": 0.5093189477920532, + "reward_std": 0.6006681323051453, + "rewards/cosine_scaled_reward/mean": -0.1672155261039734, + "rewards/cosine_scaled_reward/std": 0.34896284341812134, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1859.0, + "completions/mean_length": 1348.90625, + "completions/mean_terminated_length": 
1096.04248046875, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.09828571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2883393168449402, + "learning_rate": 7.401782177833147e-07, + "loss": -0.0, + "num_tokens": 9764603.0, + "reward": 0.8025823831558228, + "reward_std": 0.547119677066803, + "rewards/cosine_scaled_reward/mean": 0.01847870647907257, + "rewards/cosine_scaled_reward/std": 0.4346420168876648, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1782.0, + "completions/mean_length": 1086.96875, + "completions/mean_terminated_length": 909.0, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.09942857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31888866424560547, + "learning_rate": 7.330314893841101e-07, + "loss": -0.0, + "num_tokens": 9844289.0, + "reward": 0.5533354878425598, + "reward_std": 0.5319498777389526, + "rewards/cosine_scaled_reward/mean": -0.1530197560787201, + "rewards/cosine_scaled_reward/std": 0.2434682846069336, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 954.921875, + "completions/mean_terminated_length": 919.6612548828125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 
0.10057142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3025936484336853, + "learning_rate": 7.258290078201731e-07, + "loss": -0.0, + "num_tokens": 9915916.0, + "reward": 1.2692296504974365, + "reward_std": 0.5115163326263428, + "rewards/cosine_scaled_reward/mean": 0.13461479544639587, + "rewards/cosine_scaled_reward/std": 0.506001353263855, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 1351.8125, + "completions/mean_terminated_length": 1174.35302734375, + "completions/min_length": 650.0, + "completions/min_terminated_length": 650.0, + "epoch": 0.10171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23423585295677185, + "learning_rate": 7.185729670371604e-07, + "loss": -0.0, + "num_tokens": 10013432.0, + "reward": 0.724889874458313, + "reward_std": 0.7425336837768555, + "rewards/cosine_scaled_reward/mean": -0.0828675627708435, + "rewards/cosine_scaled_reward/std": 0.3893774449825287, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1906.0, + "completions/mean_length": 1153.28125, + "completions/mean_terminated_length": 1025.46435546875, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "epoch": 0.10285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3860023021697998, + "learning_rate": 7.11265577295385e-07, + "loss": 
-0.0, + "num_tokens": 10097242.0, + "reward": 0.5000253915786743, + "reward_std": 0.5103108286857605, + "rewards/cosine_scaled_reward/mean": -0.18748730421066284, + "rewards/cosine_scaled_reward/std": 0.2787182629108429, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1455.484375, + "completions/mean_terminated_length": 1166.1163330078125, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2551063895225525, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0, + "num_tokens": 10200961.0, + "reward": 0.4053259789943695, + "reward_std": 0.663999617099762, + "rewards/cosine_scaled_reward/mean": -0.18796202540397644, + "rewards/cosine_scaled_reward/std": 0.35777655243873596, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4166666865348816, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1176.953125, + "completions/mean_terminated_length": 1015.6481323242188, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.10514285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27449366450309753, + "learning_rate": 6.965056695057204e-07, + "loss": -0.0, + "num_tokens": 10286278.0, + "reward": 0.5743436217308044, + "reward_std": 0.6229422092437744, + 
"rewards/cosine_scaled_reward/mean": -0.15032817423343658, + "rewards/cosine_scaled_reward/std": 0.2899566888809204, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1434.875, + "completions/mean_terminated_length": 1156.181884765625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.10628571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2839376926422119, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0, + "num_tokens": 10389454.0, + "reward": 0.30658647418022156, + "reward_std": 0.5343226194381714, + "rewards/cosine_scaled_reward/mean": -0.22951926290988922, + "rewards/cosine_scaled_reward/std": 0.2324177473783493, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1684.0, + "completions/mean_length": 1242.390625, + "completions/mean_terminated_length": 927.1522216796875, + "completions/min_length": 508.0, + "completions/min_terminated_length": 508.0, + "epoch": 0.10742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2985072433948517, + "learning_rate": 6.815672671252315e-07, + "loss": 0.0, + "num_tokens": 10478735.0, + "reward": 0.6593698263168335, + "reward_std": 0.5845412015914917, + "rewards/cosine_scaled_reward/mean": -0.02969011664390564, + "rewards/cosine_scaled_reward/std": 0.47056320309638977, 
+ "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1931.0, + "completions/mean_length": 1203.265625, + "completions/mean_terminated_length": 1082.58935546875, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "epoch": 0.10857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2689598798751831, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0, + "num_tokens": 10566272.0, + "reward": 0.4301251173019409, + "reward_std": 0.4795047640800476, + "rewards/cosine_scaled_reward/mean": -0.22243742644786835, + "rewards/cosine_scaled_reward/std": 0.2575407326221466, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 1205.5625, + "completions/mean_terminated_length": 990.8235473632812, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.10971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30502915382385254, + "learning_rate": 6.664685702961344e-07, + "loss": -0.0, + "num_tokens": 10654564.0, + "reward": 0.896080493927002, + "reward_std": 0.6987663507461548, + "rewards/cosine_scaled_reward/mean": 0.02616523765027523, + "rewards/cosine_scaled_reward/std": 0.460237056016922, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 96 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1777.0, + "completions/mean_length": 1170.390625, + "completions/mean_terminated_length": 988.2453002929688, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.11085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3103901743888855, + "learning_rate": 6.588648530198504e-07, + "loss": -0.0, + "num_tokens": 10739733.0, + "reward": 0.6633297204971313, + "reward_std": 0.609075665473938, + "rewards/cosine_scaled_reward/mean": -0.12927262485027313, + "rewards/cosine_scaled_reward/std": 0.4114542305469513, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1817.0, + "completions/mean_length": 1136.5625, + "completions/mean_terminated_length": 947.396240234375, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2510873079299927, + "learning_rate": 6.512279744547392e-07, + "loss": 0.0, + "num_tokens": 10823537.0, + "reward": 0.6613268256187439, + "reward_std": 0.4785424768924713, + "rewards/cosine_scaled_reward/mean": -0.09902409464120865, + "rewards/cosine_scaled_reward/std": 0.4345317482948303, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1171.8125, + "completions/mean_terminated_length": 1081.17236328125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.11314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.281054824590683, + "learning_rate": 6.435602608679916e-07, + "loss": -0.0, + "num_tokens": 10909701.0, + "reward": 1.0416245460510254, + "reward_std": 0.6949809789657593, + "rewards/cosine_scaled_reward/mean": 0.0520622618496418, + "rewards/cosine_scaled_reward/std": 0.508481502532959, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1120.8125, + "completions/mean_terminated_length": 1024.8966064453125, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.11428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2910788655281067, + "learning_rate": 6.358640479194451e-07, + "loss": 0.0, + "num_tokens": 10991145.0, + "reward": 1.2036188840866089, + "reward_std": 0.8533884286880493, + "rewards/cosine_scaled_reward/mean": 0.14087192714214325, + "rewards/cosine_scaled_reward/std": 0.5375887751579285, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1978.0, + "completions/mean_length": 1076.953125, + "completions/mean_terminated_length": 1029.1966552734375, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.11542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33955609798431396, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0, + "num_tokens": 11071502.0, + "reward": 0.7810705900192261, + "reward_std": 0.5973731279373169, + "rewards/cosine_scaled_reward/mean": -0.10165221989154816, + "rewards/cosine_scaled_reward/std": 0.4130260646343231, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 1092.078125, + "completions/mean_terminated_length": 935.654541015625, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.11657142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34537607431411743, + "learning_rate": 6.203955092681039e-07, + "loss": 0.0, + "num_tokens": 11151547.0, + "reward": 0.6441041231155396, + "reward_std": 0.53089839220047, + "rewards/cosine_scaled_reward/mean": -0.10763543844223022, + "rewards/cosine_scaled_reward/std": 0.39948928356170654, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1120.625, + 
"completions/mean_terminated_length": 1006.7368774414062, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.11771428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.343980997800827, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0, + "num_tokens": 11233619.0, + "reward": 0.6925251483917236, + "reward_std": 0.5938367247581482, + "rewards/cosine_scaled_reward/mean": -0.13029994070529938, + "rewards/cosine_scaled_reward/std": 0.37749138474464417, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1913.0, + "completions/mean_length": 1120.359375, + "completions/mean_terminated_length": 948.5740966796875, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.11885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30854102969169617, + "learning_rate": 6.048412045323164e-07, + "loss": -0.0, + "num_tokens": 11315786.0, + "reward": 0.560060977935791, + "reward_std": 0.5216183662414551, + "rewards/cosine_scaled_reward/mean": -0.1418444812297821, + "rewards/cosine_scaled_reward/std": 0.33836889266967773, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1897.0, + "completions/mean_length": 1158.421875, + "completions/mean_terminated_length": 953.1346435546875, + "completions/min_length": 503.0, + 
"completions/min_terminated_length": 503.0, + "epoch": 0.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29531243443489075, + "learning_rate": 5.97037808470444e-07, + "loss": -0.0, + "num_tokens": 11401213.0, + "reward": 1.0410652160644531, + "reward_std": 0.7858219742774963, + "rewards/cosine_scaled_reward/mean": 0.09084508568048477, + "rewards/cosine_scaled_reward/std": 0.5061684250831604, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1867.0, + "completions/mean_length": 1045.859375, + "completions/mean_terminated_length": 837.867919921875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.12114285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26259294152259827, + "learning_rate": 5.892200842364462e-07, + "loss": -0.0, + "num_tokens": 11478980.0, + "reward": 1.0545225143432617, + "reward_std": 0.7633667588233948, + "rewards/cosine_scaled_reward/mean": 0.07413630187511444, + "rewards/cosine_scaled_reward/std": 0.48842984437942505, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1101.234375, + "completions/mean_terminated_length": 946.30908203125, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.12228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 
0.3363504409790039, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0, + "num_tokens": 11560611.0, + "reward": 0.648673415184021, + "reward_std": 0.6051540970802307, + "rewards/cosine_scaled_reward/mean": -0.11316327750682831, + "rewards/cosine_scaled_reward/std": 0.37149766087532043, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1857.0, + "completions/mean_length": 1225.28125, + "completions/mean_terminated_length": 1054.5283203125, + "completions/min_length": 515.0, + "completions/min_terminated_length": 515.0, + "epoch": 0.12342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2867675721645355, + "learning_rate": 5.735511803093248e-07, + "loss": 0.0, + "num_tokens": 11649389.0, + "reward": 0.560509204864502, + "reward_std": 0.6691359877586365, + "rewards/cosine_scaled_reward/mean": -0.14943289756774902, + "rewards/cosine_scaled_reward/std": 0.4461749494075775, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1227.203125, + "completions/mean_terminated_length": 1056.84912109375, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.12457142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2772690951824188, + "learning_rate": 5.657047735161255e-07, + "loss": -0.0, + "num_tokens": 11739178.0, + "reward": 
0.6980891227722168, + "reward_std": 0.624833345413208, + "rewards/cosine_scaled_reward/mean": -0.0650179386138916, + "rewards/cosine_scaled_reward/std": 0.41062912344932556, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38025420904159546, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1727.0, + "completions/mean_length": 1145.0, + "completions/mean_terminated_length": 914.8235473632812, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.12571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3468596637248993, + "learning_rate": 5.578535828967777e-07, + "loss": -0.0, + "num_tokens": 11823234.0, + "reward": 0.6972323656082153, + "reward_std": 0.5477026104927063, + "rewards/cosine_scaled_reward/mean": -0.08888379484415054, + "rewards/cosine_scaled_reward/std": 0.3565239906311035, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1969.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 977.046875, + "completions/mean_terminated_length": 977.046875, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.12685714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3180137574672699, + "learning_rate": 5.5e-07, + "loss": 0.0, + "num_tokens": 11895885.0, + "reward": 0.8744360208511353, + "reward_std": 0.5815237164497375, + "rewards/cosine_scaled_reward/mean": -0.06278196722269058, + 
"rewards/cosine_scaled_reward/std": 0.37791064381599426, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 1269.421875, + "completions/mean_terminated_length": 1089.75, + "completions/min_length": 605.0, + "completions/min_terminated_length": 605.0, + "epoch": 0.128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2817465364933014, + "learning_rate": 5.421464171032224e-07, + "loss": -0.0, + "num_tokens": 11988224.0, + "reward": 0.9151681065559387, + "reward_std": 0.594943642616272, + "rewards/cosine_scaled_reward/mean": 0.02789657562971115, + "rewards/cosine_scaled_reward/std": 0.4965399205684662, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1910.0, + "completions/max_terminated_length": 1910.0, + "completions/mean_length": 934.578125, + "completions/mean_terminated_length": 934.578125, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.12914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3341560959815979, + "learning_rate": 5.342952264838747e-07, + "loss": -0.0, + "num_tokens": 12058333.0, + "reward": 1.0256879329681396, + "reward_std": 0.717230498790741, + "rewards/cosine_scaled_reward/mean": 0.02065650373697281, + "rewards/cosine_scaled_reward/std": 0.4963410794734955, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 113 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 1055.21875, + "completions/mean_terminated_length": 971.0847778320312, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.13028571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3800676763057709, + "learning_rate": 5.264488196906752e-07, + "loss": -0.0, + "num_tokens": 12135715.0, + "reward": 0.649993896484375, + "reward_std": 0.5865596532821655, + "rewards/cosine_scaled_reward/mean": -0.1750030517578125, + "rewards/cosine_scaled_reward/std": 0.3388007879257202, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 1169.671875, + "completions/mean_terminated_length": 987.3773803710938, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.13142857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3112519085407257, + "learning_rate": 5.186095868151436e-07, + "loss": 0.0, + "num_tokens": 12221790.0, + "reward": 0.7184536457061768, + "reward_std": 0.44992831349372864, + "rewards/cosine_scaled_reward/mean": -0.06264819949865341, + "rewards/cosine_scaled_reward/std": 0.44565486907958984, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 1224.890625, + "completions/mean_terminated_length": 1072.4630126953125, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.13257142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2884223461151123, + "learning_rate": 5.107799157635538e-07, + "loss": 0.0, + "num_tokens": 12311567.0, + "reward": 0.8372049927711487, + "reward_std": 0.608986496925354, + "rewards/cosine_scaled_reward/mean": -0.026710007339715958, + "rewards/cosine_scaled_reward/std": 0.4437602162361145, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 1078.65625, + "completions/mean_terminated_length": 1030.9835205078125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "epoch": 0.1337142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3016076385974884, + "learning_rate": 5.02962191529556e-07, + "loss": -0.0, + "num_tokens": 12391625.0, + "reward": 0.8182538747787476, + "reward_std": 0.6463132500648499, + "rewards/cosine_scaled_reward/mean": -0.09087307006120682, + "rewards/cosine_scaled_reward/std": 0.3895137310028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1946.0, + "completions/mean_length": 1226.046875, + "completions/mean_terminated_length": 952.0625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.13485714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2991194427013397, + "learning_rate": 4.951587954676837e-07, + "loss": 0.0, + "num_tokens": 12480628.0, + "reward": 0.6370267868041992, + "reward_std": 0.7525250911712646, + "rewards/cosine_scaled_reward/mean": -0.056486621499061584, + "rewards/cosine_scaled_reward/std": 0.44576171040534973, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4364357888698578, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1038.96875, + "completions/mean_terminated_length": 894.8214721679688, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4483291506767273, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0, + "num_tokens": 12557530.0, + "reward": 0.9855979084968567, + "reward_std": 0.6055079698562622, + "rewards/cosine_scaled_reward/mean": 0.04748644679784775, + "rewards/cosine_scaled_reward/std": 0.47108832001686096, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 856.578125, + "completions/mean_terminated_length": 
818.1451416015625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.13714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3406151831150055, + "learning_rate": 4.79604490731896e-07, + "loss": -0.0, + "num_tokens": 12622807.0, + "reward": 0.7979192733764648, + "reward_std": 0.6180044412612915, + "rewards/cosine_scaled_reward/mean": -0.10104038566350937, + "rewards/cosine_scaled_reward/std": 0.44317325949668884, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1764.0, + "completions/mean_length": 726.34375, + "completions/mean_terminated_length": 683.7096557617188, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.1382857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4178949296474457, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0, + "num_tokens": 12678989.0, + "reward": 1.161607265472412, + "reward_std": 0.6393733024597168, + "rewards/cosine_scaled_reward/mean": 0.08080361783504486, + "rewards/cosine_scaled_reward/std": 0.5313310027122498, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1133.796875, + "completions/mean_terminated_length": 1039.22412109375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.13942857142857143, + 
"frac_reward_zero_std": 0.0, + "grad_norm": 0.3333284258842468, + "learning_rate": 4.641359520805548e-07, + "loss": 0.0, + "num_tokens": 12763112.0, + "reward": 0.9356573820114136, + "reward_std": 0.6247758269309998, + "rewards/cosine_scaled_reward/mean": -0.02435879409313202, + "rewards/cosine_scaled_reward/std": 0.4759780466556549, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1031.296875, + "completions/mean_terminated_length": 981.2950439453125, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "epoch": 0.14057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29939791560173035, + "learning_rate": 4.5643973913200837e-07, + "loss": -0.0, + "num_tokens": 12839347.0, + "reward": 0.7725162506103516, + "reward_std": 0.5560778379440308, + "rewards/cosine_scaled_reward/mean": -0.09811685979366302, + "rewards/cosine_scaled_reward/std": 0.3822804391384125, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 979.234375, + "completions/mean_terminated_length": 944.758056640625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.1417142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34992095828056335, + "learning_rate": 4.4877202554526084e-07, + "loss": 0.0, 
+ "num_tokens": 12912970.0, + "reward": 1.085427165031433, + "reward_std": 0.6837464570999146, + "rewards/cosine_scaled_reward/mean": 0.05052608996629715, + "rewards/cosine_scaled_reward/std": 0.4791998267173767, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1076.40625, + "completions/mean_terminated_length": 994.0678100585938, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.14285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27060386538505554, + "learning_rate": 4.4113514698014953e-07, + "loss": -0.0, + "num_tokens": 12992788.0, + "reward": 1.0397578477859497, + "reward_std": 0.43823006749153137, + "rewards/cosine_scaled_reward/mean": 0.019878946244716644, + "rewards/cosine_scaled_reward/std": 0.46214956045150757, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1856.0, + "completions/mean_length": 1071.53125, + "completions/mean_terminated_length": 1006.4334106445312, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "epoch": 0.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2776121497154236, + "learning_rate": 4.3353142970386557e-07, + "loss": 0.0, + "num_tokens": 13072662.0, + "reward": 1.0028693675994873, + "reward_std": 0.6879971027374268, + "rewards/cosine_scaled_reward/mean": 
0.0014346465468406677, + "rewards/cosine_scaled_reward/std": 0.42488595843315125, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1868.0, + "completions/mean_length": 1180.484375, + "completions/mean_terminated_length": 1056.5535888671875, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.14514285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2829054594039917, + "learning_rate": 4.2596318988235037e-07, + "loss": -0.0, + "num_tokens": 13159309.0, + "reward": 0.6576684713363647, + "reward_std": 0.66895592212677, + "rewards/cosine_scaled_reward/mean": -0.15554077923297882, + "rewards/cosine_scaled_reward/std": 0.3959099054336548, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1869.0, + "completions/mean_length": 1053.328125, + "completions/mean_terminated_length": 950.4310302734375, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.1462857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29738253355026245, + "learning_rate": 4.1843273287476854e-07, + "loss": -0.0, + "num_tokens": 13237074.0, + "reward": 0.8851851224899292, + "reward_std": 0.7390589118003845, + "rewards/cosine_scaled_reward/mean": -0.041782446205616, + "rewards/cosine_scaled_reward/std": 0.46901625394821167, + "rewards/format_reward/mean": 0.96875, + 
"rewards/format_reward/std": 0.17536810040473938, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 1228.484375, + "completions/mean_terminated_length": 1111.4107666015625, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.14742857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25943535566329956, + "learning_rate": 4.1094235253127374e-07, + "loss": -0.0, + "num_tokens": 13326401.0, + "reward": 0.9628820419311523, + "reward_std": 0.6490253210067749, + "rewards/cosine_scaled_reward/mean": 0.004878522828221321, + "rewards/cosine_scaled_reward/std": 0.45456331968307495, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 1089.578125, + "completions/mean_terminated_length": 952.6607666015625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.14857142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3009719252586365, + "learning_rate": 4.034943304942796e-07, + "loss": 0.0, + "num_tokens": 13406638.0, + "reward": 0.5984547138214111, + "reward_std": 0.7008002996444702, + "rewards/cosine_scaled_reward/mean": -0.14608514308929443, + "rewards/cosine_scaled_reward/std": 0.37894922494888306, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1651.0, + "completions/mean_length": 1058.03125, + "completions/mean_terminated_length": 916.607177734375, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.14971428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.306725412607193, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0, + "num_tokens": 13484088.0, + "reward": 1.0469268560409546, + "reward_std": 0.6023457050323486, + "rewards/cosine_scaled_reward/mean": 0.0703384131193161, + "rewards/cosine_scaled_reward/std": 0.47298464179039, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1773.0, + "completions/mean_length": 1342.78125, + "completions/mean_terminated_length": 919.6500244140625, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.15085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3032574951648712, + "learning_rate": 3.8873442270461485e-07, + "loss": -0.0, + "num_tokens": 13581090.0, + "reward": 0.4643245339393616, + "reward_std": 0.7533800601959229, + "rewards/cosine_scaled_reward/mean": -0.06471271812915802, + "rewards/cosine_scaled_reward/std": 0.4610835611820221, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49501484632492065, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, 
+ "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1144.921875, + "completions/mean_terminated_length": 957.4906005859375, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32285141944885254, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.0, + "num_tokens": 13665589.0, + "reward": 0.5014957189559937, + "reward_std": 0.5352932214736938, + "rewards/cosine_scaled_reward/mean": -0.17112717032432556, + "rewards/cosine_scaled_reward/std": 0.28127768635749817, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 975.53125, + "completions/mean_terminated_length": 958.5079956054688, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.15314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40716752409935, + "learning_rate": 3.7417099217982686e-07, + "loss": -0.0, + "num_tokens": 13738591.0, + "reward": 1.1759617328643799, + "reward_std": 0.4804629683494568, + "rewards/cosine_scaled_reward/mean": 0.08798093348741531, + "rewards/cosine_scaled_reward/std": 0.5343761444091797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1686.0, + "completions/max_terminated_length": 1686.0, + 
"completions/mean_length": 758.515625, + "completions/mean_terminated_length": 758.515625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.15428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42696353793144226, + "learning_rate": 3.6696851061588994e-07, + "loss": -0.0, + "num_tokens": 13797608.0, + "reward": 1.3851683139801025, + "reward_std": 0.5234883427619934, + "rewards/cosine_scaled_reward/mean": 0.19258417189121246, + "rewards/cosine_scaled_reward/std": 0.49346473813056946, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1169.875, + "completions/mean_terminated_length": 1095.4576416015625, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.15542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28027620911598206, + "learning_rate": 3.5982178221668533e-07, + "loss": -0.0, + "num_tokens": 13883152.0, + "reward": 1.0174503326416016, + "reward_std": 0.5889347791671753, + "rewards/cosine_scaled_reward/mean": 0.016537662595510483, + "rewards/cosine_scaled_reward/std": 0.4763922095298767, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1814.0, + "completions/mean_length": 1105.3125, + "completions/mean_terminated_length": 1042.4666748046875, + "completions/min_length": 446.0, + 
"completions/min_terminated_length": 446.0, + "epoch": 0.15657142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3002299666404724, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0, + "num_tokens": 13964500.0, + "reward": 0.841381847858429, + "reward_std": 0.6354345083236694, + "rewards/cosine_scaled_reward/mean": -0.07149658352136612, + "rewards/cosine_scaled_reward/std": 0.4138363003730774, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1125.484375, + "completions/mean_terminated_length": 974.5272216796875, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.15771428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28766506910324097, + "learning_rate": 3.45704275117204e-07, + "loss": -0.0, + "num_tokens": 14047843.0, + "reward": 0.8758631944656372, + "reward_std": 0.7212573289871216, + "rewards/cosine_scaled_reward/mean": -0.05425591766834259, + "rewards/cosine_scaled_reward/std": 0.4783853590488434, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1216.171875, + "completions/mean_terminated_length": 1160.7166748046875, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.15885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 
0.2882857024669647, + "learning_rate": 3.387377967463493e-07, + "loss": -0.0, + "num_tokens": 14136318.0, + "reward": 0.7189284563064575, + "reward_std": 0.4593912959098816, + "rewards/cosine_scaled_reward/mean": -0.13272328674793243, + "rewards/cosine_scaled_reward/std": 0.33584704995155334, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1142.140625, + "completions/mean_terminated_length": 1012.732177734375, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.16, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.3000667095184326, + "learning_rate": 3.3183567088914833e-07, + "loss": 0.0, + "num_tokens": 14219639.0, + "reward": 0.8278639316558838, + "reward_std": 0.46724599599838257, + "rewards/cosine_scaled_reward/mean": -0.03919300064444542, + "rewards/cosine_scaled_reward/std": 0.4650508463382721, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1919.0, + "completions/mean_length": 1025.421875, + "completions/mean_terminated_length": 975.131103515625, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.16114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3207882046699524, + "learning_rate": 3.250000000000001e-07, + "loss": 0.0, + "num_tokens": 14295826.0, + "reward": 
0.8871637582778931, + "reward_std": 0.6538586616516113, + "rewards/cosine_scaled_reward/mean": -0.04079316183924675, + "rewards/cosine_scaled_reward/std": 0.43451616168022156, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1233.90625, + "completions/mean_terminated_length": 1149.689697265625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.16228571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3009903132915497, + "learning_rate": 3.182328662904756e-07, + "loss": 0.0, + "num_tokens": 14385300.0, + "reward": 0.8573208451271057, + "reward_std": 0.6099269390106201, + "rewards/cosine_scaled_reward/mean": -0.055714573711156845, + "rewards/cosine_scaled_reward/std": 0.43728360533714294, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 1136.078125, + "completions/mean_terminated_length": 1005.8035888671875, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.16342857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31794917583465576, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0, + "num_tokens": 14468825.0, + "reward": 0.6553314924240112, + "reward_std": 0.6344339847564697, + "rewards/cosine_scaled_reward/mean": 
-0.11764675378799438, + "rewards/cosine_scaled_reward/std": 0.3099633455276489, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 1220.6875, + "completions/mean_terminated_length": 1029.769287109375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.16457142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3814108967781067, + "learning_rate": 3.0491243424323783e-07, + "loss": 0.0, + "num_tokens": 14558437.0, + "reward": 0.7285318970680237, + "reward_std": 0.8925961256027222, + "rewards/cosine_scaled_reward/mean": -0.05760904401540756, + "rewards/cosine_scaled_reward/std": 0.492266446352005, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1880.0, + "completions/mean_length": 969.796875, + "completions/mean_terminated_length": 916.7704467773438, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.1657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3201180398464203, + "learning_rate": 2.9836319343816397e-07, + "loss": -0.0, + "num_tokens": 14630448.0, + "reward": 0.8149441480636597, + "reward_std": 0.5824600458145142, + "rewards/cosine_scaled_reward/mean": -0.08471541851758957, + "rewards/cosine_scaled_reward/std": 0.475755512714386, + "rewards/format_reward/mean": 
0.984375, + "rewards/format_reward/std": 0.125, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1943.0, + "completions/mean_length": 1034.484375, + "completions/mean_terminated_length": 966.9166870117188, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, + "epoch": 0.16685714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28184273838996887, + "learning_rate": 2.918906036420294e-07, + "loss": -0.0, + "num_tokens": 14707271.0, + "reward": 0.8387603759765625, + "reward_std": 0.5346506237983704, + "rewards/cosine_scaled_reward/mean": -0.07280732691287994, + "rewards/cosine_scaled_reward/std": 0.43024110794067383, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 1249.984375, + "completions/mean_terminated_length": 1046.568603515625, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "epoch": 0.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32145801186561584, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0, + "num_tokens": 14798054.0, + "reward": 0.7505484819412231, + "reward_std": 0.5473448634147644, + "rewards/cosine_scaled_reward/mean": -0.07003828883171082, + "rewards/cosine_scaled_reward/std": 0.4046306014060974, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1844.0, + "completions/mean_length": 1062.828125, + "completions/mean_terminated_length": 960.913818359375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.16914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2667451500892639, + "learning_rate": 2.791832395815782e-07, + "loss": -0.0, + "num_tokens": 14877259.0, + "reward": 0.7823130488395691, + "reward_std": 0.48230016231536865, + "rewards/cosine_scaled_reward/mean": -0.06978099048137665, + "rewards/cosine_scaled_reward/std": 0.37567150592803955, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1386.875, + "completions/mean_terminated_length": 1086.3636474609375, + "completions/min_length": 439.0, + "completions/min_terminated_length": 439.0, + "epoch": 0.1702857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2730913758277893, + "learning_rate": 2.729523361034538e-07, + "loss": 0.0, + "num_tokens": 14977915.0, + "reward": 0.48214927315711975, + "reward_std": 0.8376681804656982, + "rewards/cosine_scaled_reward/mean": -0.14173786342144012, + "rewards/cosine_scaled_reward/std": 0.4272434711456299, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1831.0, + "completions/mean_length": 994.15625, + "completions/mean_terminated_length": 942.3278198242188, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.17142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2946690022945404, + "learning_rate": 2.6680582402757324e-07, + "loss": -0.0, + "num_tokens": 15052045.0, + "reward": 0.8893749713897705, + "reward_std": 0.7130615711212158, + "rewards/cosine_scaled_reward/mean": -0.05531252920627594, + "rewards/cosine_scaled_reward/std": 0.4389563202857971, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 150 + } + ], + "logging_steps": 1, + "max_steps": 200, + "num_input_tokens_seen": 15052045, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-150/training_args.bin b/checkpoint-150/training_args.bin new file mode 100644 index 0000000..9e03ee7 --- /dev/null +++ b/checkpoint-150/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec4fbe7e29ae78abab9c9fd5555870c1bffe7656ceef4ac5fa2148a15b61b1e3 +size 8888 diff --git a/checkpoint-150/zero_to_fp32.py b/checkpoint-150/zero_to_fp32.py new file mode 100644 index 0000000..0e75914 --- /dev/null +++ b/checkpoint-150/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. 
# It gets copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
# application.
#
# example:
#   python zero_to_fp32.py . output_dir/
#   or
#   python zero_to_fp32.py . output_dir/ --safe_serialization

import argparse
import torch
import glob
import math
import os
import re
import gc
import json
import numpy as np
from tqdm import tqdm
from collections import OrderedDict
from dataclasses import dataclass

# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
# DeepSpeed data structures it has to be available in the current python environment.
from deepspeed.utils import logger
from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)


@dataclass
class zero_model_state:
    """Per-rank view of a ``*_model_states.pt`` file, as extracted by ``parse_model_states``."""
    # buffer name -> buffer tensor, already cast with ``.float()``
    buffers: dict
    # NOTE: annotated as a dict, but other code in this file iterates it as a sequence of
    # name -> shape mappings (``for shapes in param_shapes``).
    param_shapes: dict
    # list of ``[key, value]`` pairs copied from the checkpoint's "shared_params" mapping
    shared_params: list
    # value stored under DS_VERSION; None when the checkpoint does not record it
    ds_version: int
    # value stored under FROZEN_PARAM_SHAPES; None when absent
    frozen_param_shapes: dict
    # value stored under FROZEN_PARAM_FRAGMENTS; None when absent
    frozen_param_fragments: dict


debug = 0

# load to cpu
device = torch.device('cpu')


def atoi(text):
    """Return ``int(text)`` when ``text`` consists only of digits, otherwise ``text`` unchanged."""
    return int(text) if text.isdigit() else text


def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)

    The text is split on runs of digits; digit chunks become ints so that
    e.g. "rank_10" sorts after "rank_2".
    '''
    return [atoi(c) for c in re.split(r'(\d+)', text)]


def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the rank-0 model states file inside ``checkpoint_dir``.

    Args:
        - ``checkpoint_dir``: directory that holds the checkpoint files
        - ``zero_stage``: ZeRO stage of the checkpoint; selects the expected file name

    Raises:
        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory, or the expected file is missing
        - ``ValueError``: ``zero_stage`` is neither ``<= 2`` nor ``3``
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # previously fell through and failed later with an UnboundLocalError on ``file``
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file


def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """
    Return the files in ``checkpoint_dir`` matching ``glob_pattern``, sorted in natural (human) order.

    Raises:
        - ``FileNotFoundError``: no file matches the pattern
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)

    if len(ckpt_files) == 0:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return ckpt_files


def get_optim_files(checkpoint_dir):
    """Return all ``*_optim_states.pt`` files in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")


def get_model_state_files(checkpoint_dir):
    """Return all ``*_model_states.pt`` files in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")


def parse_model_states(files):
    """
    Load every model states file in ``files`` and return one ``zero_model_state`` per file.

    Raises:
        - ``ValueError``: a file does not contain the ``BUFFER_NAMES`` entry
    """
    zero_model_states = []
    for file in files:
        # NOTE: weights_only=False unpickles arbitrary Python objects - only run this on checkpoints you trust.
        state_dict = torch.load(file, map_location=device, weights_only=False)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional in the checkpoint
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if frozen_param_shapes is not None and debug:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states


def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load every optimizer states file in ``files`` and extract the fp32 groups.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` files, one per partition
        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message

    Returns:
        ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` holds one entry per file

    Raises:
        - ``ValueError``: the first file is not a zero checkpoint, the number of files does not match the
          recorded partition count, or the zero stage is unknown
    """
    total_files = len(files)
    state_dicts = []
    for f in tqdm(files, desc='Loading checkpoint shards'):
        # NOTE: weights_only=False unpickles arbitrary Python objects - only run this on checkpoints you trust.
        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.

    if isinstance(world_size, list):
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][fp32_groups_key] for sd in state_dicts]
    return zero_stage, world_size, fp32_flat_groups


def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: forwarded unchanged to the stage-specific reconstruction routine

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_files = get_model_state_files(ds_checkpoint_dir)

    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # parse_optim_states has already rejected any stage other than <= 2 or 3
    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
    elif zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)


def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """
    Copy the frozen parameters recorded for rank 0 into ``state_dict`` (modified in place).

    Does nothing when rank 0 recorded no frozen parameter shapes.
    """
    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
        return

    frozen_param_fragments = zero_model_states[0].frozen_param_fragments

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    if debug:
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {wanted_numel}')

    avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # the fragment is stored as-is, without reshaping
        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


def _has_callable(obj, fn):
    """Return True when ``obj`` has an attribute named ``fn`` that is callable."""
    return callable(getattr(obj, fn, None))


def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(
        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient. + Convert a pseudo tensor to a torch tensor with ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-200/config.json b/checkpoint-200/config.json new file mode 100644 index 0000000..78fed5b --- /dev/null +++ b/checkpoint-200/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": 4096, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_cache": false, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/checkpoint-200/generation_config.json b/checkpoint-200/generation_config.json new file mode 100644 index 0000000..92878bd --- /dev/null +++ b/checkpoint-200/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..417b7cd --- 
/dev/null +++ b/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d31f8d912d8556b877b73713942a1ef3d16f102e8e4af8bc8ca1b2cd30ee2e32 +size 5331274140 diff --git a/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..2f9cc40 --- /dev/null +++ b/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92dffb8fd9dea596337dd9e4424b7befb9d707ee8ec9e2114e09d35a9c62619d +size 5331276572 diff --git a/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..5454dd2 --- /dev/null +++ b/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5be5a1e6f75a65d11bcbfdaba184b16fc04b2e29d0eb18df1d50df736d8c195f +size 5331276892 diff --git a/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..ea9dfd5 --- /dev/null +++ b/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a9b594b87c6bd7cee5405dfdf7461ad4ddfd8bece628fc8757a62d5ac4149c7 +size 5331273884 diff --git a/checkpoint-200/global_step200/mp_rank_00_model_states.pt b/checkpoint-200/global_step200/mp_rank_00_model_states.pt new file mode 100644 index 0000000..7c44f9f --- /dev/null +++ b/checkpoint-200/global_step200/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7a0586a5a489ebb43c544e6b0f40d6faf0fcfc162cfaad7b8d98f2f24c647cef +size 3554267640 diff --git a/checkpoint-200/latest b/checkpoint-200/latest new file mode 100644 index 0000000..753e24e --- /dev/null +++ b/checkpoint-200/latest @@ -0,0 +1 @@ +global_step200 \ No newline at end of file diff --git a/checkpoint-200/model.safetensors b/checkpoint-200/model.safetensors new file mode 100644 index 0000000..24c1613 --- /dev/null +++ b/checkpoint-200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:721e7cd7a52fbe85031e588ef9dd53b84820dc30295efc7a202ec5bf16e6a44d +size 3554214752 diff --git a/checkpoint-200/rng_state_0.pth b/checkpoint-200/rng_state_0.pth new file mode 100644 index 0000000..86f2a23 --- /dev/null +++ b/checkpoint-200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d27bca98fc43661d89f342b159db8ba39985151ad393ad050976214ea15c356 +size 14960 diff --git a/checkpoint-200/rng_state_1.pth b/checkpoint-200/rng_state_1.pth new file mode 100644 index 0000000..cb38291 --- /dev/null +++ b/checkpoint-200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:045043b7647cb23fd0c5f157aeab975633f4988068c700133c9e818bd7d23acc +size 14960 diff --git a/checkpoint-200/rng_state_2.pth b/checkpoint-200/rng_state_2.pth new file mode 100644 index 0000000..ef24bc9 --- /dev/null +++ b/checkpoint-200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75b318b1607e2d2d058b7cae62ad715b10b7734cab165019ee7faeb90fa8f9cb +size 14960 diff --git a/checkpoint-200/rng_state_3.pth b/checkpoint-200/rng_state_3.pth new file mode 100644 index 0000000..2df3ea6 --- /dev/null +++ b/checkpoint-200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0bd846d0d4a459d36cd4ee58b207443fa2c1b79a1c4a8df8fedfff7f31c370c +size 14960 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 
index 0000000..fdb5a84 --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729ccc5c1effddf89d086a25cf24ed6a75e431dc3254d66b666c6a9c32393455 +size 1064 diff --git a/checkpoint-200/special_tokens_map.json b/checkpoint-200/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/checkpoint-200/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-200/tokenizer.json b/checkpoint-200/tokenizer.json new file mode 100644 index 0000000..e7cd2c1 --- /dev/null +++ b/checkpoint-200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a +size 11422959 diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000..ef6e98c --- /dev/null +++ b/checkpoint-200/tokenizer_config.json @@ -0,0 +1,195 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": 
"<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + 
'<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000..dd9a905 --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,5434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.22857142857142856, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + 
"completions/max_length": 2048.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 1702.03125, + "completions/mean_terminated_length": 993.6190795898438, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.001142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2544386684894562, + "learning_rate": 0.0, + "loss": -0.0, + "num_tokens": 118418.0, + "reward": 0.17899775505065918, + "reward_std": 0.7650213241577148, + "rewards/cosine_scaled_reward/mean": -0.09800112992525101, + "rewards/cosine_scaled_reward/std": 0.37953105568885803, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.48795005679130554, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1894.0, + "completions/mean_length": 1738.90625, + "completions/mean_terminated_length": 949.0, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.002285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2436082512140274, + "learning_rate": 5e-08, + "loss": -0.0, + "num_tokens": 239748.0, + "reward": 0.3848632574081421, + "reward_std": 0.9111153483390808, + "rewards/cosine_scaled_reward/mean": 0.020556632429361343, + "rewards/cosine_scaled_reward/std": 0.4492928683757782, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1545.0, + "completions/mean_length": 1989.015625, + "completions/mean_terminated_length": 
1104.25, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "epoch": 0.0034285714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2544717788696289, + "learning_rate": 1e-07, + "loss": -0.0, + "num_tokens": 377517.0, + "reward": -0.3279358148574829, + "reward_std": 0.33216947317123413, + "rewards/cosine_scaled_reward/mean": -0.20303040742874146, + "rewards/cosine_scaled_reward/std": 0.179075226187706, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.27048972249031067, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1566.421875, + "completions/mean_terminated_length": 1084.84375, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "epoch": 0.004571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28807103633880615, + "learning_rate": 1.5e-07, + "loss": -0.0, + "num_tokens": 487576.0, + "reward": 0.2716121971607208, + "reward_std": 0.6643469333648682, + "rewards/cosine_scaled_reward/mean": -0.12981891632080078, + "rewards/cosine_scaled_reward/std": 0.3019586503505707, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.5029674172401428, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1807.0, + "completions/mean_length": 1936.84375, + "completions/mean_terminated_length": 1031.71435546875, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "epoch": 0.005714285714285714, + "frac_reward_zero_std": 0.0, 
+ "grad_norm": 0.26783761382102966, + "learning_rate": 2e-07, + "loss": -0.0, + "num_tokens": 622350.0, + "reward": -0.3612896800041199, + "reward_std": 0.41048353910446167, + "rewards/cosine_scaled_reward/mean": -0.23533234000205994, + "rewards/cosine_scaled_reward/std": 0.20467400550842285, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.3145764470100403, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1301.0, + "completions/mean_length": 1889.453125, + "completions/mean_terminated_length": 779.625, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "epoch": 0.006857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.262518972158432, + "learning_rate": 2.5e-07, + "loss": 0.0, + "num_tokens": 754923.0, + "reward": -0.29250282049179077, + "reward_std": 0.5422531962394714, + "rewards/cosine_scaled_reward/mean": -0.22437641024589539, + "rewards/cosine_scaled_reward/std": 0.22509199380874634, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36596253514289856, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1568.0, + "completions/mean_length": 1921.921875, + "completions/mean_terminated_length": 1314.45458984375, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "epoch": 0.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22601397335529327, + "learning_rate": 3e-07, + "loss": 0.0, + "num_tokens": 888334.0, + "reward": 0.025340259075164795, + "reward_std": 
0.7285393476486206, + "rewards/cosine_scaled_reward/mean": -0.1279548704624176, + "rewards/cosine_scaled_reward/std": 0.40222346782684326, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1736.859375, + "completions/mean_terminated_length": 999.9473876953125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.009142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24552854895591736, + "learning_rate": 3.5e-07, + "loss": 0.0, + "num_tokens": 1009909.0, + "reward": 0.21729671955108643, + "reward_std": 0.6989120244979858, + "rewards/cosine_scaled_reward/mean": -0.055414143949747086, + "rewards/cosine_scaled_reward/std": 0.47493892908096313, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.4732423722743988, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 1967.53125, + "completions/mean_terminated_length": 1475.77783203125, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "epoch": 0.010285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2430322915315628, + "learning_rate": 4e-07, + "loss": 0.0, + "num_tokens": 1147287.0, + "reward": -0.21451422572135925, + "reward_std": 0.587526798248291, + "rewards/cosine_scaled_reward/mean": -0.19319462776184082, + "rewards/cosine_scaled_reward/std": 0.29357606172561646, + 
"rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.38025420904159546, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1708.546875, + "completions/mean_terminated_length": 961.75, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.011428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2543582320213318, + "learning_rate": 4.5e-07, + "loss": 0.0, + "num_tokens": 1267466.0, + "reward": 0.02539752423763275, + "reward_std": 0.545810341835022, + "rewards/cosine_scaled_reward/mean": -0.14355123043060303, + "rewards/cosine_scaled_reward/std": 0.36147356033325195, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.467176616191864, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1579.0, + "completions/mean_length": 1967.734375, + "completions/mean_terminated_length": 1191.8333740234375, + "completions/min_length": 843.0, + "completions/min_terminated_length": 843.0, + "epoch": 0.012571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24583907425403595, + "learning_rate": 5e-07, + "loss": -0.0, + "num_tokens": 1405073.0, + "reward": -0.46971434354782104, + "reward_std": 0.36104393005371094, + "rewards/cosine_scaled_reward/mean": -0.28173214197158813, + "rewards/cosine_scaled_reward/std": 0.17775526642799377, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29378482699394226, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 1707.5625, + "completions/mean_terminated_length": 1176.47998046875, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.013714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3135142922401428, + "learning_rate": 5.5e-07, + "loss": -0.0, + "num_tokens": 1525301.0, + "reward": 0.0018395520746707916, + "reward_std": 0.7012988328933716, + "rewards/cosine_scaled_reward/mean": -0.21783021092414856, + "rewards/cosine_scaled_reward/std": 0.324150949716568, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.5, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1745.0, + "completions/mean_length": 1841.96875, + "completions/mean_terminated_length": 1168.933349609375, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.014857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2532394826412201, + "learning_rate": 6e-07, + "loss": -0.0, + "num_tokens": 1654227.0, + "reward": -0.10322706401348114, + "reward_std": 0.6915165185928345, + "rewards/cosine_scaled_reward/mean": -0.17661353945732117, + "rewards/cosine_scaled_reward/std": 0.329875111579895, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + 
"completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1816.390625, + "completions/mean_terminated_length": 1306.8499755859375, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "epoch": 0.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28405147790908813, + "learning_rate": 6.5e-07, + "loss": 0.0, + "num_tokens": 1781084.0, + "reward": 0.10602855682373047, + "reward_std": 0.630502462387085, + "rewards/cosine_scaled_reward/mean": -0.11104822158813477, + "rewards/cosine_scaled_reward/std": 0.3846627473831177, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.4732423722743988, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1574.0, + "completions/mean_length": 1702.109375, + "completions/mean_terminated_length": 818.1666870117188, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.017142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28779250383377075, + "learning_rate": 7e-07, + "loss": 0.0, + "num_tokens": 1900939.0, + "reward": 0.32734519243240356, + "reward_std": 0.3870265483856201, + "rewards/cosine_scaled_reward/mean": 0.007422588765621185, + "rewards/cosine_scaled_reward/std": 0.45787373185157776, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.467176616191864, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, 
+ "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.018285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2337152510881424, + "learning_rate": 7.5e-07, + "loss": -0.0, + "num_tokens": 2042451.0, + "reward": -0.5429925918579102, + "reward_std": 0.3153150975704193, + "rewards/cosine_scaled_reward/mean": -0.2714962661266327, + "rewards/cosine_scaled_reward/std": 0.1678173691034317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 1564.921875, + "completions/mean_terminated_length": 858.8846435546875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.019428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33599403500556946, + "learning_rate": 8e-07, + "loss": -0.0, + "num_tokens": 2153126.0, + "reward": 0.17696775496006012, + "reward_std": 0.6489306688308716, + "rewards/cosine_scaled_reward/mean": -0.11464111506938934, + "rewards/cosine_scaled_reward/std": 0.3551919758319855, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49501484632492065, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 1795.390625, + "completions/mean_terminated_length": 893.21435546875, + "completions/min_length": 619.0, + "completions/min_terminated_length": 619.0, + "epoch": 0.02057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 
0.22697053849697113, + "learning_rate": 8.499999999999999e-07, + "loss": -0.0, + "num_tokens": 2278407.0, + "reward": -0.10711958259344101, + "reward_std": 0.5238703489303589, + "rewards/cosine_scaled_reward/mean": -0.1785597801208496, + "rewards/cosine_scaled_reward/std": 0.2545098662376404, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 1921.484375, + "completions/mean_terminated_length": 1238.300048828125, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "epoch": 0.021714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23972108960151672, + "learning_rate": 9e-07, + "loss": 0.0, + "num_tokens": 2412638.0, + "reward": 0.029344379901885986, + "reward_std": 0.6719281077384949, + "rewards/cosine_scaled_reward/mean": -0.086890310049057, + "rewards/cosine_scaled_reward/std": 0.40220555663108826, + "rewards/format_reward/mean": 0.203125, + "rewards/format_reward/std": 0.40550529956817627, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1728.5625, + "completions/mean_terminated_length": 845.4117431640625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.022857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23309311270713806, + "learning_rate": 9.499999999999999e-07, + "loss": 0.0, + "num_tokens": 2534618.0, + "reward": 
0.0131673663854599, + "reward_std": 0.4436222314834595, + "rewards/cosine_scaled_reward/mean": -0.13404130935668945, + "rewards/cosine_scaled_reward/std": 0.32819250226020813, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 1777.953125, + "completions/mean_terminated_length": 1087.8333740234375, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29990270733833313, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 2659215.0, + "reward": -0.1764472872018814, + "reward_std": 0.5121938586235046, + "rewards/cosine_scaled_reward/mean": -0.2444736361503601, + "rewards/cosine_scaled_reward/std": 0.289971262216568, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.467176616191864, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1361.28125, + "completions/mean_terminated_length": 921.0769653320312, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.025142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29922786355018616, + "learning_rate": 9.99931462820376e-07, + "loss": -0.0, + "num_tokens": 2755353.0, + "reward": 0.6089149713516235, + "reward_std": 0.5986809730529785, + "rewards/cosine_scaled_reward/mean": -0.05491749942302704, + 
"rewards/cosine_scaled_reward/std": 0.39076483249664307, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1565.046875, + "completions/mean_terminated_length": 903.2222290039062, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.026285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27512773871421814, + "learning_rate": 9.997258721585931e-07, + "loss": -0.0, + "num_tokens": 2866308.0, + "reward": 0.21871733665466309, + "reward_std": 0.5976030826568604, + "rewards/cosine_scaled_reward/mean": -0.10157884657382965, + "rewards/cosine_scaled_reward/std": 0.3856185972690582, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.49776285886764526, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 1801.671875, + "completions/mean_terminated_length": 1259.75, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "epoch": 0.027428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22642865777015686, + "learning_rate": 9.993832906395582e-07, + "loss": -0.0, + "num_tokens": 2992543.0, + "reward": 0.04899948835372925, + "reward_std": 0.8525694608688354, + "rewards/cosine_scaled_reward/mean": -0.17081275582313538, + "rewards/cosine_scaled_reward/std": 0.3993513882160187, + "rewards/format_reward/mean": 0.390625, + 
"rewards/format_reward/std": 0.4917473793029785, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1715.765625, + "completions/mean_terminated_length": 1035.4761962890625, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.02857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25316134095191956, + "learning_rate": 9.989038226169207e-07, + "loss": -0.0, + "num_tokens": 3112648.0, + "reward": 0.10585837811231613, + "reward_std": 0.7828943729400635, + "rewards/cosine_scaled_reward/mean": -0.11894579976797104, + "rewards/cosine_scaled_reward/std": 0.4141720235347748, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 1917.703125, + "completions/mean_terminated_length": 1452.357177734375, + "completions/min_length": 840.0, + "completions/min_terminated_length": 840.0, + "epoch": 0.029714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2521306574344635, + "learning_rate": 9.982876141412855e-07, + "loss": -0.0, + "num_tokens": 3246013.0, + "reward": 0.17620250582695007, + "reward_std": 0.6548349857330322, + "rewards/cosine_scaled_reward/mean": -0.08377375453710556, + "rewards/cosine_scaled_reward/std": 0.3527655303478241, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1851.015625, + "completions/mean_terminated_length": 1147.5, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "epoch": 0.030857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2730060815811157, + "learning_rate": 9.975348529157229e-07, + "loss": -0.0, + "num_tokens": 3374766.0, + "reward": -0.18854813277721405, + "reward_std": 0.49348777532577515, + "rewards/cosine_scaled_reward/mean": -0.21146157383918762, + "rewards/cosine_scaled_reward/std": 0.2601618766784668, + "rewards/format_reward/mean": 0.234375, + "rewards/format_reward/std": 0.42695629596710205, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1798.328125, + "completions/mean_terminated_length": 1049.3125, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2566036880016327, + "learning_rate": 9.96645768238595e-07, + "loss": 0.0, + "num_tokens": 3500195.0, + "reward": 0.06705980002880096, + "reward_std": 0.7090284824371338, + "rewards/cosine_scaled_reward/mean": -0.10709509253501892, + "rewards/cosine_scaled_reward/std": 0.4101051986217499, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1930.203125, + "completions/mean_terminated_length": 1210.3333740234375, + "completions/min_length": 582.0, + "completions/min_terminated_length": 582.0, + "epoch": 0.03314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25197461247444153, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0, + "num_tokens": 3634200.0, + "reward": -0.2462695688009262, + "reward_std": 0.5237302780151367, + "rewards/cosine_scaled_reward/mean": -0.2012597918510437, + "rewards/cosine_scaled_reward/std": 0.23252712190151215, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36596253514289856, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1900.0, + "completions/mean_length": 1847.65625, + "completions/mean_terminated_length": 1061.6923828125, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "epoch": 0.03428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30431485176086426, + "learning_rate": 9.944597532678119e-07, + "loss": 0.0, + "num_tokens": 3762986.0, + "reward": -0.05392302945256233, + "reward_std": 0.7249555587768555, + "rewards/cosine_scaled_reward/mean": -0.15196150541305542, + "rewards/cosine_scaled_reward/std": 0.34566983580589294, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 
1860.0, + "completions/mean_length": 1838.671875, + "completions/mean_terminated_length": 931.5833740234375, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.03542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2484513372182846, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0, + "num_tokens": 3891157.0, + "reward": -0.11271396279335022, + "reward_std": 0.6705260872840881, + "rewards/cosine_scaled_reward/mean": -0.1813569962978363, + "rewards/cosine_scaled_reward/std": 0.4071698486804962, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1715.0, + "completions/mean_length": 1910.109375, + "completions/mean_terminated_length": 1417.6429443359375, + "completions/min_length": 906.0, + "completions/min_terminated_length": 906.0, + "epoch": 0.036571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25329527258872986, + "learning_rate": 9.917322325514487e-07, + "loss": -0.0, + "num_tokens": 4023756.0, + "reward": -0.08931556344032288, + "reward_std": 0.6381070613861084, + "rewards/cosine_scaled_reward/mean": -0.16965776681900024, + "rewards/cosine_scaled_reward/std": 0.37385129928588867, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1865.0, + "completions/mean_length": 2023.71875, + "completions/mean_terminated_length": 1530.0, + 
"completions/min_length": 1107.0, + "completions/min_terminated_length": 1107.0, + "epoch": 0.037714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22758109867572784, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0, + "num_tokens": 4164490.0, + "reward": -0.4589868187904358, + "reward_std": 0.5177067518234253, + "rewards/cosine_scaled_reward/mean": -0.2919934093952179, + "rewards/cosine_scaled_reward/std": 0.2252870500087738, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3333333432674408, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1454.78125, + "completions/mean_terminated_length": 963.2571411132812, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.038857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3234354257583618, + "learning_rate": 9.88466529153356e-07, + "loss": 0.0, + "num_tokens": 4267148.0, + "reward": 0.656031608581543, + "reward_std": 0.7529654502868652, + "rewards/cosine_scaled_reward/mean": 0.05457830801606178, + "rewards/cosine_scaled_reward/std": 0.49684229493141174, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.501733124256134, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1724.0, + "completions/mean_length": 1819.078125, + "completions/mean_terminated_length": 716.0909423828125, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "epoch": 0.04, + 
"frac_reward_zero_std": 0.0, + "grad_norm": 0.2821458876132965, + "learning_rate": 9.866330768241983e-07, + "loss": -0.0, + "num_tokens": 4395065.0, + "reward": -0.09630556404590607, + "reward_std": 0.7089139223098755, + "rewards/cosine_scaled_reward/mean": -0.15752778947353363, + "rewards/cosine_scaled_reward/std": 0.3647947609424591, + "rewards/format_reward/mean": 0.21875, + "rewards/format_reward/std": 0.4166666865348816, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1811.0, + "completions/mean_length": 1954.34375, + "completions/mean_terminated_length": 1382.0, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "epoch": 0.04114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24163897335529327, + "learning_rate": 9.846666218300807e-07, + "loss": -0.0, + "num_tokens": 4531255.0, + "reward": -0.34593287110328674, + "reward_std": 0.44493502378463745, + "rewards/cosine_scaled_reward/mean": -0.24327893555164337, + "rewards/cosine_scaled_reward/std": 0.24784433841705322, + "rewards/format_reward/mean": 0.140625, + "rewards/format_reward/std": 0.3503824472427368, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1723.0, + "completions/mean_length": 1868.921875, + "completions/mean_terminated_length": 1092.916748046875, + "completions/min_length": 620.0, + "completions/min_terminated_length": 620.0, + "epoch": 0.04228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24795544147491455, + "learning_rate": 9.825677631722435e-07, + "loss": 
-0.0, + "num_tokens": 4661890.0, + "reward": -0.23053905367851257, + "reward_std": 0.34036368131637573, + "rewards/cosine_scaled_reward/mean": -0.2246445268392563, + "rewards/cosine_scaled_reward/std": 0.15942412614822388, + "rewards/format_reward/mean": 0.21875, + "rewards/format_reward/std": 0.4166666865348816, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 1889.53125, + "completions/mean_terminated_length": 1033.800048828125, + "completions/min_length": 810.0, + "completions/min_terminated_length": 810.0, + "epoch": 0.04342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24283826351165771, + "learning_rate": 9.80337140183366e-07, + "loss": 0.0, + "num_tokens": 4794532.0, + "reward": -0.10043507814407349, + "reward_std": 0.47925832867622375, + "rewards/cosine_scaled_reward/mean": -0.13615503907203674, + "rewards/cosine_scaled_reward/std": 0.3336707651615143, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.38025420904159546, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1515.0, + "completions/mean_length": 1644.828125, + "completions/mean_terminated_length": 689.9473876953125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.044571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28362998366355896, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0, + "num_tokens": 4910585.0, + "reward": 0.12284853309392929, + "reward_std": 0.4183085858821869, + 
"rewards/cosine_scaled_reward/mean": -0.11045074462890625, + "rewards/cosine_scaled_reward/std": 0.30217844247817993, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1632.0, + "completions/mean_length": 1618.28125, + "completions/mean_terminated_length": 902.0833740234375, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.045714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.262617826461792, + "learning_rate": 9.754833590196926e-07, + "loss": 0.0, + "num_tokens": 5024227.0, + "reward": 0.2076582908630371, + "reward_std": 0.42125773429870605, + "rewards/cosine_scaled_reward/mean": -0.12273336946964264, + "rewards/cosine_scaled_reward/std": 0.4404613971710205, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1914.0, + "completions/mean_length": 1717.734375, + "completions/mean_terminated_length": 1235.0384521484375, + "completions/min_length": 664.0, + "completions/min_terminated_length": 664.0, + "epoch": 0.046857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23294499516487122, + "learning_rate": 9.728616793536587e-07, + "loss": -0.0, + "num_tokens": 5145314.0, + "reward": 0.011502981185913086, + "reward_std": 0.6816084980964661, + "rewards/cosine_scaled_reward/mean": -0.22081100940704346, + "rewards/cosine_scaled_reward/std": 
0.37589573860168457, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1672.0, + "completions/mean_length": 1703.921875, + "completions/mean_terminated_length": 579.933349609375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34672290086746216, + "learning_rate": 9.701111919237408e-07, + "loss": -0.0, + "num_tokens": 5264725.0, + "reward": -0.2616002857685089, + "reward_std": 0.37952175736427307, + "rewards/cosine_scaled_reward/mean": -0.26361262798309326, + "rewards/cosine_scaled_reward/std": 0.17531204223632812, + "rewards/format_reward/mean": 0.265625, + "rewards/format_reward/std": 0.44515693187713623, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1370.0, + "completions/mean_length": 1681.84375, + "completions/mean_terminated_length": 814.631591796875, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.04914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.263967901468277, + "learning_rate": 9.672327345550543e-07, + "loss": -0.0, + "num_tokens": 5383979.0, + "reward": 0.13376155495643616, + "reward_std": 0.46012288331985474, + "rewards/cosine_scaled_reward/mean": -0.08155670762062073, + "rewards/cosine_scaled_reward/std": 0.3612325191497803, + "rewards/format_reward/mean": 0.296875, + "rewards/format_reward/std": 0.4604927599430084, + "step": 
43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 1624.625, + "completions/mean_terminated_length": 869.9130859375, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.05028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28927963972091675, + "learning_rate": 9.64227184053598e-07, + "loss": -0.0, + "num_tokens": 5498651.0, + "reward": 0.20869271457195282, + "reward_std": 0.5558150410652161, + "rewards/cosine_scaled_reward/mean": -0.0987786278128624, + "rewards/cosine_scaled_reward/std": 0.42912590503692627, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49501484632492065, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1851.0, + "completions/mean_length": 2006.96875, + "completions/mean_terminated_length": 1522.800048828125, + "completions/min_length": 955.0, + "completions/min_terminated_length": 955.0, + "epoch": 0.05142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24254000186920166, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0, + "num_tokens": 5638753.0, + "reward": -0.2540697157382965, + "reward_std": 0.4600578844547272, + "rewards/cosine_scaled_reward/mean": -0.20515984296798706, + "rewards/cosine_scaled_reward/std": 0.3251590430736542, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36596253514289856, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1563.0, + "completions/mean_length": 1765.984375, + "completions/mean_terminated_length": 919.9375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "epoch": 0.052571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2645930349826813, + "learning_rate": 9.578385041664925e-07, + "loss": 0.0, + "num_tokens": 5762944.0, + "reward": -0.213707834482193, + "reward_std": 0.38778313994407654, + "rewards/cosine_scaled_reward/mean": -0.2318539321422577, + "rewards/cosine_scaled_reward/std": 0.21436986327171326, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1583.40625, + "completions/mean_terminated_length": 986.0714721679688, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.053714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.311797559261322, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0, + "num_tokens": 5874682.0, + "reward": 0.27925533056259155, + "reward_std": 0.6467443704605103, + "rewards/cosine_scaled_reward/mean": -0.07912233471870422, + "rewards/cosine_scaled_reward/std": 0.4737093150615692, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.5, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1527.0, + "completions/mean_length": 1690.0625, + "completions/mean_terminated_length": 1006.727294921875, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.054857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26644304394721985, + "learning_rate": 9.509529358847654e-07, + "loss": -0.0, + "num_tokens": 5993390.0, + "reward": 0.13692031800746918, + "reward_std": 0.5655145049095154, + "rewards/cosine_scaled_reward/mean": -0.12685233354568481, + "rewards/cosine_scaled_reward/std": 0.32320985198020935, + "rewards/format_reward/mean": 0.390625, + "rewards/format_reward/std": 0.4917473793029785, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1387.140625, + "completions/mean_terminated_length": 804.0294189453125, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3078882396221161, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0, + "num_tokens": 6092231.0, + "reward": 0.35559189319610596, + "reward_std": 0.5927403569221497, + "rewards/cosine_scaled_reward/mean": -0.09564155340194702, + "rewards/cosine_scaled_reward/std": 0.4046906530857086, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.501733124256134, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 1674.890625, + "completions/mean_terminated_length": 
962.5909423828125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.05714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23925544321537018, + "learning_rate": 9.43578868212728e-07, + "loss": -0.0, + "num_tokens": 6210240.0, + "reward": 0.18573230504989624, + "reward_std": 0.5264967083930969, + "rewards/cosine_scaled_reward/mean": -0.09463384002447128, + "rewards/cosine_scaled_reward/std": 0.4100942015647888, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.48795005679130554, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1347.40625, + "completions/mean_terminated_length": 836.1621704101562, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.05828571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.325811505317688, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0, + "num_tokens": 6306682.0, + "reward": 0.1735648661851883, + "reward_std": 0.5335988998413086, + "rewards/cosine_scaled_reward/mean": -0.21009255945682526, + "rewards/cosine_scaled_reward/std": 0.2623959481716156, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49501484632492065, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1390.0, + "completions/mean_length": 1727.765625, + "completions/mean_terminated_length": 767.0625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 
0.05942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27392977476119995, + "learning_rate": 9.357252853159505e-07, + "loss": 0.0, + "num_tokens": 6428611.0, + "reward": -0.16267812252044678, + "reward_std": 0.5682471990585327, + "rewards/cosine_scaled_reward/mean": -0.2219640612602234, + "rewards/cosine_scaled_reward/std": 0.36739134788513184, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1907.0, + "completions/mean_length": 1609.171875, + "completions/mean_terminated_length": 924.5999755859375, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.060571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28155064582824707, + "learning_rate": 9.316216432703916e-07, + "loss": -0.0, + "num_tokens": 6542430.0, + "reward": 0.0752667784690857, + "reward_std": 0.7118167281150818, + "rewards/cosine_scaled_reward/mean": -0.18892911076545715, + "rewards/cosine_scaled_reward/std": 0.3222156763076782, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1588.234375, + "completions/mean_terminated_length": 1067.166748046875, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "epoch": 0.061714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2555343806743622, + "learning_rate": 
9.274017555754407e-07, + "loss": 0.0, + "num_tokens": 6655221.0, + "reward": 0.6341299414634705, + "reward_std": 1.0656921863555908, + "rewards/cosine_scaled_reward/mean": 0.05143994837999344, + "rewards/cosine_scaled_reward/std": 0.5348308086395264, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.5029674172401428, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1420.0, + "completions/mean_length": 1549.5625, + "completions/mean_terminated_length": 821.0769653320312, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "epoch": 0.06285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30243629217147827, + "learning_rate": 9.230669076497687e-07, + "loss": -0.0, + "num_tokens": 6764681.0, + "reward": 0.13021975755691528, + "reward_std": 0.3984764516353607, + "rewards/cosine_scaled_reward/mean": -0.13801513612270355, + "rewards/cosine_scaled_reward/std": 0.41228073835372925, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49501484632492065, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1633.25, + "completions/mean_terminated_length": 1132.689697265625, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "epoch": 0.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23835402727127075, + "learning_rate": 9.186184199300463e-07, + "loss": -0.0, + "num_tokens": 6880169.0, + "reward": 0.27981996536254883, + "reward_std": 
0.5018116235733032, + "rewards/cosine_scaled_reward/mean": -0.10227750986814499, + "rewards/cosine_scaled_reward/std": 0.481824666261673, + "rewards/format_reward/mean": 0.484375, + "rewards/format_reward/std": 0.5037065148353577, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1807.0, + "completions/mean_length": 1699.875, + "completions/mean_terminated_length": 1156.7999267578125, + "completions/min_length": 642.0, + "completions/min_terminated_length": 642.0, + "epoch": 0.06514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22349494695663452, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0, + "num_tokens": 7000529.0, + "reward": -0.026505012065172195, + "reward_std": 0.5785415172576904, + "rewards/cosine_scaled_reward/mean": -0.20856501162052155, + "rewards/cosine_scaled_reward/std": 0.2749907374382019, + "rewards/format_reward/mean": 0.390625, + "rewards/format_reward/std": 0.4917473793029785, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1457.875, + "completions/mean_terminated_length": 1054.105224609375, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "epoch": 0.06628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.261942595243454, + "learning_rate": 9.093859795212817e-07, + "loss": 0.0, + "num_tokens": 7103929.0, + "reward": 0.5745843648910522, + "reward_std": 0.8671218156814575, + "rewards/cosine_scaled_reward/mean": -0.03302033245563507, + 
"rewards/cosine_scaled_reward/std": 0.45529407262802124, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4836103618144989, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1590.0625, + "completions/mean_terminated_length": 1159.8787841796875, + "completions/min_length": 591.0, + "completions/min_terminated_length": 591.0, + "epoch": 0.06742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24828943610191345, + "learning_rate": 9.046048391230247e-07, + "loss": -0.0, + "num_tokens": 7216157.0, + "reward": 0.3377103805541992, + "reward_std": 0.5543617010116577, + "rewards/cosine_scaled_reward/mean": -0.1045822948217392, + "rewards/cosine_scaled_reward/std": 0.39040952920913696, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.501733124256134, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 1622.84375, + "completions/mean_terminated_length": 1076.21435546875, + "completions/min_length": 555.0, + "completions/min_terminated_length": 555.0, + "epoch": 0.06857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2752656936645508, + "learning_rate": 8.997156826556369e-07, + "loss": -0.0, + "num_tokens": 7330907.0, + "reward": 0.11114693433046341, + "reward_std": 0.6926254034042358, + "rewards/cosine_scaled_reward/mean": -0.1788015365600586, + "rewards/cosine_scaled_reward/std": 0.39409172534942627, + "rewards/format_reward/mean": 0.46875, + 
"rewards/format_reward/std": 0.5029674172401428, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1708.859375, + "completions/mean_terminated_length": 1014.4285888671875, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.06971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22669929265975952, + "learning_rate": 8.9471999940354e-07, + "loss": -0.0, + "num_tokens": 7451794.0, + "reward": 0.2345120906829834, + "reward_std": 0.6293160319328308, + "rewards/cosine_scaled_reward/mean": -0.1093064472079277, + "rewards/cosine_scaled_reward/std": 0.29189831018447876, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1281.53125, + "completions/mean_terminated_length": 1004.2978515625, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.07085714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25438693165779114, + "learning_rate": 8.896193111002475e-07, + "loss": 0.0, + "num_tokens": 7544044.0, + "reward": 0.9180847406387329, + "reward_std": 0.6390912532806396, + "rewards/cosine_scaled_reward/mean": 0.06841734796762466, + "rewards/cosine_scaled_reward/std": 0.48315128684043884, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4166666865348816, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 1310.46875, + "completions/mean_terminated_length": 896.731689453125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28795576095581055, + "learning_rate": 8.844151714648274e-07, + "loss": -0.0, + "num_tokens": 7638170.0, + "reward": 0.6488770246505737, + "reward_std": 0.7876260876655579, + "rewards/cosine_scaled_reward/mean": -0.019311510026454926, + "rewards/cosine_scaled_reward/std": 0.4736698865890503, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.467176616191864, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 1307.625, + "completions/mean_terminated_length": 1039.8297119140625, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.07314285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25637197494506836, + "learning_rate": 8.791091657286267e-07, + "loss": -0.0, + "num_tokens": 7732810.0, + "reward": 0.8280279636383057, + "reward_std": 0.6804471015930176, + "rewards/cosine_scaled_reward/mean": 0.015576483681797981, + "rewards/cosine_scaled_reward/std": 0.44819310307502747, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.40550529956817627, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1846.0, + "completions/mean_length": 1322.125, + "completions/mean_terminated_length": 914.9268188476562, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.07428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2944399118423462, + "learning_rate": 8.737029101523929e-07, + "loss": -0.0, + "num_tokens": 7828130.0, + "reward": 0.15610456466674805, + "reward_std": 0.4606686234474182, + "rewards/cosine_scaled_reward/mean": -0.24226020276546478, + "rewards/cosine_scaled_reward/std": 0.33131492137908936, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4836103618144989, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 1020.21875, + "completions/mean_terminated_length": 806.9057006835938, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.07542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32644009590148926, + "learning_rate": 8.681980515339463e-07, + "loss": 0.0, + "num_tokens": 7903656.0, + "reward": 0.7972471714019775, + "reward_std": 0.7674820423126221, + "rewards/cosine_scaled_reward/mean": -0.031063925474882126, + "rewards/cosine_scaled_reward/std": 0.5106223225593567, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1910.0, + "completions/mean_length": 1750.859375, + "completions/mean_terminated_length": 1142.4285888671875, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "epoch": 0.07657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2270829975605011, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0, + "num_tokens": 8026447.0, + "reward": -0.1400720775127411, + "reward_std": 0.3325888514518738, + "rewards/cosine_scaled_reward/mean": -0.24972353875637054, + "rewards/cosine_scaled_reward/std": 0.16404789686203003, + "rewards/format_reward/mean": 0.359375, + "rewards/format_reward/std": 0.4836103618144989, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1424.0, + "completions/mean_length": 769.546875, + "completions/mean_terminated_length": 637.2930908203125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.07771428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37025144696235657, + "learning_rate": 8.568992620281243e-07, + "loss": -0.0, + "num_tokens": 8084954.0, + "reward": 0.9792699813842773, + "reward_std": 0.804767370223999, + "rewards/cosine_scaled_reward/mean": 0.03651002421975136, + "rewards/cosine_scaled_reward/std": 0.46041443943977356, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1701.0, + "completions/mean_length": 1086.234375, + 
"completions/mean_terminated_length": 886.6226806640625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.07885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3763800263404846, + "learning_rate": 8.511087728614862e-07, + "loss": 0.0, + "num_tokens": 8164817.0, + "reward": 0.35803771018981934, + "reward_std": 0.5702667236328125, + "rewards/cosine_scaled_reward/mean": -0.24285613000392914, + "rewards/cosine_scaled_reward/std": 0.3019825220108032, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1898.0, + "completions/mean_length": 1463.375, + "completions/mean_terminated_length": 1112.5999755859375, + "completions/min_length": 503.0, + "completions/min_terminated_length": 503.0, + "epoch": 0.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24232418835163116, + "learning_rate": 8.452265630457282e-07, + "loss": -0.0, + "num_tokens": 8269929.0, + "reward": 0.3703588843345642, + "reward_std": 0.7288752794265747, + "rewards/cosine_scaled_reward/mean": -0.1351330280303955, + "rewards/cosine_scaled_reward/std": 0.3751916289329529, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4836103618144989, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1409.859375, + "completions/mean_terminated_length": 973.2368774414062, + "completions/min_length": 404.0, + "completions/min_terminated_length": 
404.0, + "epoch": 0.08114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.300010621547699, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0, + "num_tokens": 8370880.0, + "reward": 0.5196826457977295, + "reward_std": 0.7097917795181274, + "rewards/cosine_scaled_reward/mean": -0.044846177101135254, + "rewards/cosine_scaled_reward/std": 0.508389949798584, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4917473793029785, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 1228.046875, + "completions/mean_terminated_length": 931.4680786132812, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.08228571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30454304814338684, + "learning_rate": 8.331941759724268e-07, + "loss": -0.0, + "num_tokens": 8459827.0, + "reward": 0.41365131735801697, + "reward_std": 0.5005639791488647, + "rewards/cosine_scaled_reward/mean": -0.1759868562221527, + "rewards/cosine_scaled_reward/std": 0.19868774712085724, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1719.0, + "completions/mean_length": 1513.28125, + "completions/mean_terminated_length": 1192.4500732421875, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "epoch": 0.08342857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27848970890045166, + 
"learning_rate": 8.270476638965461e-07, + "loss": -0.0, + "num_tokens": 8567405.0, + "reward": 0.09570223093032837, + "reward_std": 0.5445049405097961, + "rewards/cosine_scaled_reward/mean": -0.2802739143371582, + "rewards/cosine_scaled_reward/std": 0.25603488087654114, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4787135720252991, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1888.0, + "completions/mean_length": 1240.125, + "completions/mean_terminated_length": 924.0, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "epoch": 0.08457142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2789021134376526, + "learning_rate": 8.208167604184217e-07, + "loss": 0.0, + "num_tokens": 8656701.0, + "reward": 0.7823752760887146, + "reward_std": 0.6479132175445557, + "rewards/cosine_scaled_reward/mean": 0.031812600791454315, + "rewards/cosine_scaled_reward/std": 0.5397623181343079, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1455.953125, + "completions/mean_terminated_length": 1186.8409423828125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "epoch": 0.08571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22443196177482605, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0, + "num_tokens": 8760842.0, + "reward": 0.8040015697479248, + "reward_std": 
0.5675323009490967, + "rewards/cosine_scaled_reward/mean": 0.027000809088349342, + "rewards/cosine_scaled_reward/std": 0.5096040964126587, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4364357888698578, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1720.0, + "completions/mean_length": 1177.859375, + "completions/mean_terminated_length": 863.1276245117188, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.08685714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32647648453712463, + "learning_rate": 8.081093963579707e-07, + "loss": 0.0, + "num_tokens": 8846625.0, + "reward": 0.310506671667099, + "reward_std": 0.5110941529273987, + "rewards/cosine_scaled_reward/mean": -0.2119341641664505, + "rewards/cosine_scaled_reward/std": 0.24737994372844696, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44515693187713623, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1787.0, + "completions/mean_length": 1263.4375, + "completions/mean_terminated_length": 1043.760009765625, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2545543611049652, + "learning_rate": 8.01636806561836e-07, + "loss": -0.0, + "num_tokens": 8939061.0, + "reward": 0.5484907031059265, + "reward_std": 0.48998576402664185, + "rewards/cosine_scaled_reward/mean": -0.13200464844703674, + "rewards/cosine_scaled_reward/std": 
0.3430649936199188, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39339789748191833, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1460.78125, + "completions/mean_terminated_length": 1059.0, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.08914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2583931088447571, + "learning_rate": 7.950875657567621e-07, + "loss": 0.0, + "num_tokens": 9043271.0, + "reward": 0.6075442433357239, + "reward_std": 0.6895643472671509, + "rewards/cosine_scaled_reward/mean": -0.0009153857827186584, + "rewards/cosine_scaled_reward/std": 0.48922818899154663, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4917473793029785, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1956.0, + "completions/mean_length": 1054.875, + "completions/mean_terminated_length": 892.3635864257812, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.09028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29089078307151794, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0, + "num_tokens": 9120879.0, + "reward": 0.6885831356048584, + "reward_std": 0.508629322052002, + "rewards/cosine_scaled_reward/mean": -0.09320840239524841, + "rewards/cosine_scaled_reward/std": 0.38835227489471436, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 79 + 
}, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1399.046875, + "completions/mean_terminated_length": 1145.1087646484375, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "epoch": 0.09142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27458345890045166, + "learning_rate": 7.817671337095244e-07, + "loss": 0.0, + "num_tokens": 9220810.0, + "reward": 0.5549384355545044, + "reward_std": 0.7092134952545166, + "rewards/cosine_scaled_reward/mean": -0.09753081202507019, + "rewards/cosine_scaled_reward/std": 0.4125780463218689, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4364357888698578, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1833.0, + "completions/mean_length": 1084.984375, + "completions/mean_terminated_length": 906.6481323242188, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.09257142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37247684597969055, + "learning_rate": 7.75e-07, + "loss": -0.0, + "num_tokens": 9301521.0, + "reward": 0.5357480049133301, + "reward_std": 0.5661624670028687, + "rewards/cosine_scaled_reward/mean": -0.18525099754333496, + "rewards/cosine_scaled_reward/std": 0.3385297954082489, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1260.921875, + "completions/mean_terminated_length": 998.5625, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.09371428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27329322695732117, + "learning_rate": 7.681643291108517e-07, + "loss": -0.0, + "num_tokens": 9392548.0, + "reward": 0.9478914737701416, + "reward_std": 0.4313860237598419, + "rewards/cosine_scaled_reward/mean": 0.09894578158855438, + "rewards/cosine_scaled_reward/std": 0.5477120876312256, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4364357888698578, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1309.671875, + "completions/mean_terminated_length": 922.9285888671875, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.09485714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3202998638153076, + "learning_rate": 7.612622032536507e-07, + "loss": -0.0, + "num_tokens": 9487455.0, + "reward": 0.5201998949050903, + "reward_std": 0.6858996152877808, + "rewards/cosine_scaled_reward/mean": -0.09927503764629364, + "rewards/cosine_scaled_reward/std": 0.37909674644470215, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1685.0, + "completions/mean_length": 1185.703125, + "completions/mean_terminated_length": 965.9019775390625, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29646041989326477, + "learning_rate": 7.54295724882796e-07, + "loss": -0.0, + "num_tokens": 9574036.0, + "reward": 0.6779025793075562, + "reward_std": 0.557724118232727, + "rewards/cosine_scaled_reward/mean": -0.09073619544506073, + "rewards/cosine_scaled_reward/std": 0.3855368196964264, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1297.828125, + "completions/mean_terminated_length": 1158.907470703125, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "epoch": 0.09714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21307455003261566, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0, + "num_tokens": 9667417.0, + "reward": 0.5093189477920532, + "reward_std": 0.6006681323051453, + "rewards/cosine_scaled_reward/mean": -0.1672155261039734, + "rewards/cosine_scaled_reward/std": 0.34896284341812134, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1859.0, + "completions/mean_length": 1348.90625, + "completions/mean_terminated_length": 
1096.04248046875, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.09828571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2883393168449402, + "learning_rate": 7.401782177833147e-07, + "loss": -0.0, + "num_tokens": 9764603.0, + "reward": 0.8025823831558228, + "reward_std": 0.547119677066803, + "rewards/cosine_scaled_reward/mean": 0.01847870647907257, + "rewards/cosine_scaled_reward/std": 0.4346420168876648, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1782.0, + "completions/mean_length": 1086.96875, + "completions/mean_terminated_length": 909.0, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.09942857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31888866424560547, + "learning_rate": 7.330314893841101e-07, + "loss": -0.0, + "num_tokens": 9844289.0, + "reward": 0.5533354878425598, + "reward_std": 0.5319498777389526, + "rewards/cosine_scaled_reward/mean": -0.1530197560787201, + "rewards/cosine_scaled_reward/std": 0.2434682846069336, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 954.921875, + "completions/mean_terminated_length": 919.6612548828125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 
0.10057142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3025936484336853, + "learning_rate": 7.258290078201731e-07, + "loss": -0.0, + "num_tokens": 9915916.0, + "reward": 1.2692296504974365, + "reward_std": 0.5115163326263428, + "rewards/cosine_scaled_reward/mean": 0.13461479544639587, + "rewards/cosine_scaled_reward/std": 0.506001353263855, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 1351.8125, + "completions/mean_terminated_length": 1174.35302734375, + "completions/min_length": 650.0, + "completions/min_terminated_length": 650.0, + "epoch": 0.10171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23423585295677185, + "learning_rate": 7.185729670371604e-07, + "loss": -0.0, + "num_tokens": 10013432.0, + "reward": 0.724889874458313, + "reward_std": 0.7425336837768555, + "rewards/cosine_scaled_reward/mean": -0.0828675627708435, + "rewards/cosine_scaled_reward/std": 0.3893774449825287, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1906.0, + "completions/mean_length": 1153.28125, + "completions/mean_terminated_length": 1025.46435546875, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "epoch": 0.10285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3860023021697998, + "learning_rate": 7.11265577295385e-07, + "loss": 
-0.0, + "num_tokens": 10097242.0, + "reward": 0.5000253915786743, + "reward_std": 0.5103108286857605, + "rewards/cosine_scaled_reward/mean": -0.18748730421066284, + "rewards/cosine_scaled_reward/std": 0.2787182629108429, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1455.484375, + "completions/mean_terminated_length": 1166.1163330078125, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2551063895225525, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0, + "num_tokens": 10200961.0, + "reward": 0.4053259789943695, + "reward_std": 0.663999617099762, + "rewards/cosine_scaled_reward/mean": -0.18796202540397644, + "rewards/cosine_scaled_reward/std": 0.35777655243873596, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4166666865348816, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1176.953125, + "completions/mean_terminated_length": 1015.6481323242188, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.10514285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27449366450309753, + "learning_rate": 6.965056695057204e-07, + "loss": -0.0, + "num_tokens": 10286278.0, + "reward": 0.5743436217308044, + "reward_std": 0.6229422092437744, + 
"rewards/cosine_scaled_reward/mean": -0.15032817423343658, + "rewards/cosine_scaled_reward/std": 0.2899566888809204, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1434.875, + "completions/mean_terminated_length": 1156.181884765625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.10628571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2839376926422119, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0, + "num_tokens": 10389454.0, + "reward": 0.30658647418022156, + "reward_std": 0.5343226194381714, + "rewards/cosine_scaled_reward/mean": -0.22951926290988922, + "rewards/cosine_scaled_reward/std": 0.2324177473783493, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1684.0, + "completions/mean_length": 1242.390625, + "completions/mean_terminated_length": 927.1522216796875, + "completions/min_length": 508.0, + "completions/min_terminated_length": 508.0, + "epoch": 0.10742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2985072433948517, + "learning_rate": 6.815672671252315e-07, + "loss": 0.0, + "num_tokens": 10478735.0, + "reward": 0.6593698263168335, + "reward_std": 0.5845412015914917, + "rewards/cosine_scaled_reward/mean": -0.02969011664390564, + "rewards/cosine_scaled_reward/std": 0.47056320309638977, 
+ "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1931.0, + "completions/mean_length": 1203.265625, + "completions/mean_terminated_length": 1082.58935546875, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "epoch": 0.10857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2689598798751831, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0, + "num_tokens": 10566272.0, + "reward": 0.4301251173019409, + "reward_std": 0.4795047640800476, + "rewards/cosine_scaled_reward/mean": -0.22243742644786835, + "rewards/cosine_scaled_reward/std": 0.2575407326221466, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 1205.5625, + "completions/mean_terminated_length": 990.8235473632812, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.10971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30502915382385254, + "learning_rate": 6.664685702961344e-07, + "loss": -0.0, + "num_tokens": 10654564.0, + "reward": 0.896080493927002, + "reward_std": 0.6987663507461548, + "rewards/cosine_scaled_reward/mean": 0.02616523765027523, + "rewards/cosine_scaled_reward/std": 0.460237056016922, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 96 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1777.0, + "completions/mean_length": 1170.390625, + "completions/mean_terminated_length": 988.2453002929688, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.11085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3103901743888855, + "learning_rate": 6.588648530198504e-07, + "loss": -0.0, + "num_tokens": 10739733.0, + "reward": 0.6633297204971313, + "reward_std": 0.609075665473938, + "rewards/cosine_scaled_reward/mean": -0.12927262485027313, + "rewards/cosine_scaled_reward/std": 0.4114542305469513, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1817.0, + "completions/mean_length": 1136.5625, + "completions/mean_terminated_length": 947.396240234375, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2510873079299927, + "learning_rate": 6.512279744547392e-07, + "loss": 0.0, + "num_tokens": 10823537.0, + "reward": 0.6613268256187439, + "reward_std": 0.4785424768924713, + "rewards/cosine_scaled_reward/mean": -0.09902409464120865, + "rewards/cosine_scaled_reward/std": 0.4345317482948303, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1171.8125, + "completions/mean_terminated_length": 1081.17236328125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.11314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.281054824590683, + "learning_rate": 6.435602608679916e-07, + "loss": -0.0, + "num_tokens": 10909701.0, + "reward": 1.0416245460510254, + "reward_std": 0.6949809789657593, + "rewards/cosine_scaled_reward/mean": 0.0520622618496418, + "rewards/cosine_scaled_reward/std": 0.508481502532959, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1120.8125, + "completions/mean_terminated_length": 1024.8966064453125, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.11428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2910788655281067, + "learning_rate": 6.358640479194451e-07, + "loss": 0.0, + "num_tokens": 10991145.0, + "reward": 1.2036188840866089, + "reward_std": 0.8533884286880493, + "rewards/cosine_scaled_reward/mean": 0.14087192714214325, + "rewards/cosine_scaled_reward/std": 0.5375887751579285, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1978.0, + "completions/mean_length": 1076.953125, + "completions/mean_terminated_length": 1029.1966552734375, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.11542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33955609798431396, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0, + "num_tokens": 11071502.0, + "reward": 0.7810705900192261, + "reward_std": 0.5973731279373169, + "rewards/cosine_scaled_reward/mean": -0.10165221989154816, + "rewards/cosine_scaled_reward/std": 0.4130260646343231, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 1092.078125, + "completions/mean_terminated_length": 935.654541015625, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.11657142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34537607431411743, + "learning_rate": 6.203955092681039e-07, + "loss": 0.0, + "num_tokens": 11151547.0, + "reward": 0.6441041231155396, + "reward_std": 0.53089839220047, + "rewards/cosine_scaled_reward/mean": -0.10763543844223022, + "rewards/cosine_scaled_reward/std": 0.39948928356170654, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1120.625, + 
"completions/mean_terminated_length": 1006.7368774414062, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.11771428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.343980997800827, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0, + "num_tokens": 11233619.0, + "reward": 0.6925251483917236, + "reward_std": 0.5938367247581482, + "rewards/cosine_scaled_reward/mean": -0.13029994070529938, + "rewards/cosine_scaled_reward/std": 0.37749138474464417, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1913.0, + "completions/mean_length": 1120.359375, + "completions/mean_terminated_length": 948.5740966796875, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.11885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30854102969169617, + "learning_rate": 6.048412045323164e-07, + "loss": -0.0, + "num_tokens": 11315786.0, + "reward": 0.560060977935791, + "reward_std": 0.5216183662414551, + "rewards/cosine_scaled_reward/mean": -0.1418444812297821, + "rewards/cosine_scaled_reward/std": 0.33836889266967773, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1897.0, + "completions/mean_length": 1158.421875, + "completions/mean_terminated_length": 953.1346435546875, + "completions/min_length": 503.0, + 
"completions/min_terminated_length": 503.0, + "epoch": 0.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29531243443489075, + "learning_rate": 5.97037808470444e-07, + "loss": -0.0, + "num_tokens": 11401213.0, + "reward": 1.0410652160644531, + "reward_std": 0.7858219742774963, + "rewards/cosine_scaled_reward/mean": 0.09084508568048477, + "rewards/cosine_scaled_reward/std": 0.5061684250831604, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1867.0, + "completions/mean_length": 1045.859375, + "completions/mean_terminated_length": 837.867919921875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.12114285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26259294152259827, + "learning_rate": 5.892200842364462e-07, + "loss": -0.0, + "num_tokens": 11478980.0, + "reward": 1.0545225143432617, + "reward_std": 0.7633667588233948, + "rewards/cosine_scaled_reward/mean": 0.07413630187511444, + "rewards/cosine_scaled_reward/std": 0.48842984437942505, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1101.234375, + "completions/mean_terminated_length": 946.30908203125, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.12228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 
0.3363504409790039, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0, + "num_tokens": 11560611.0, + "reward": 0.648673415184021, + "reward_std": 0.6051540970802307, + "rewards/cosine_scaled_reward/mean": -0.11316327750682831, + "rewards/cosine_scaled_reward/std": 0.37149766087532043, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1857.0, + "completions/mean_length": 1225.28125, + "completions/mean_terminated_length": 1054.5283203125, + "completions/min_length": 515.0, + "completions/min_terminated_length": 515.0, + "epoch": 0.12342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2867675721645355, + "learning_rate": 5.735511803093248e-07, + "loss": 0.0, + "num_tokens": 11649389.0, + "reward": 0.560509204864502, + "reward_std": 0.6691359877586365, + "rewards/cosine_scaled_reward/mean": -0.14943289756774902, + "rewards/cosine_scaled_reward/std": 0.4461749494075775, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1227.203125, + "completions/mean_terminated_length": 1056.84912109375, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.12457142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2772690951824188, + "learning_rate": 5.657047735161255e-07, + "loss": -0.0, + "num_tokens": 11739178.0, + "reward": 
0.6980891227722168, + "reward_std": 0.624833345413208, + "rewards/cosine_scaled_reward/mean": -0.0650179386138916, + "rewards/cosine_scaled_reward/std": 0.41062912344932556, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38025420904159546, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1727.0, + "completions/mean_length": 1145.0, + "completions/mean_terminated_length": 914.8235473632812, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.12571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3468596637248993, + "learning_rate": 5.578535828967777e-07, + "loss": -0.0, + "num_tokens": 11823234.0, + "reward": 0.6972323656082153, + "reward_std": 0.5477026104927063, + "rewards/cosine_scaled_reward/mean": -0.08888379484415054, + "rewards/cosine_scaled_reward/std": 0.3565239906311035, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1969.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 977.046875, + "completions/mean_terminated_length": 977.046875, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.12685714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3180137574672699, + "learning_rate": 5.5e-07, + "loss": 0.0, + "num_tokens": 11895885.0, + "reward": 0.8744360208511353, + "reward_std": 0.5815237164497375, + "rewards/cosine_scaled_reward/mean": -0.06278196722269058, + 
"rewards/cosine_scaled_reward/std": 0.37791064381599426, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 1269.421875, + "completions/mean_terminated_length": 1089.75, + "completions/min_length": 605.0, + "completions/min_terminated_length": 605.0, + "epoch": 0.128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2817465364933014, + "learning_rate": 5.421464171032224e-07, + "loss": -0.0, + "num_tokens": 11988224.0, + "reward": 0.9151681065559387, + "reward_std": 0.594943642616272, + "rewards/cosine_scaled_reward/mean": 0.02789657562971115, + "rewards/cosine_scaled_reward/std": 0.4965399205684662, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1910.0, + "completions/max_terminated_length": 1910.0, + "completions/mean_length": 934.578125, + "completions/mean_terminated_length": 934.578125, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.12914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3341560959815979, + "learning_rate": 5.342952264838747e-07, + "loss": -0.0, + "num_tokens": 12058333.0, + "reward": 1.0256879329681396, + "reward_std": 0.717230498790741, + "rewards/cosine_scaled_reward/mean": 0.02065650373697281, + "rewards/cosine_scaled_reward/std": 0.4963410794734955, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 113 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 1055.21875, + "completions/mean_terminated_length": 971.0847778320312, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.13028571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3800676763057709, + "learning_rate": 5.264488196906752e-07, + "loss": -0.0, + "num_tokens": 12135715.0, + "reward": 0.649993896484375, + "reward_std": 0.5865596532821655, + "rewards/cosine_scaled_reward/mean": -0.1750030517578125, + "rewards/cosine_scaled_reward/std": 0.3388007879257202, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 1169.671875, + "completions/mean_terminated_length": 987.3773803710938, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.13142857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3112519085407257, + "learning_rate": 5.186095868151436e-07, + "loss": 0.0, + "num_tokens": 12221790.0, + "reward": 0.7184536457061768, + "reward_std": 0.44992831349372864, + "rewards/cosine_scaled_reward/mean": -0.06264819949865341, + "rewards/cosine_scaled_reward/std": 0.44565486907958984, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 1224.890625, + "completions/mean_terminated_length": 1072.4630126953125, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.13257142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2884223461151123, + "learning_rate": 5.107799157635538e-07, + "loss": 0.0, + "num_tokens": 12311567.0, + "reward": 0.8372049927711487, + "reward_std": 0.608986496925354, + "rewards/cosine_scaled_reward/mean": -0.026710007339715958, + "rewards/cosine_scaled_reward/std": 0.4437602162361145, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 1078.65625, + "completions/mean_terminated_length": 1030.9835205078125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "epoch": 0.1337142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3016076385974884, + "learning_rate": 5.02962191529556e-07, + "loss": -0.0, + "num_tokens": 12391625.0, + "reward": 0.8182538747787476, + "reward_std": 0.6463132500648499, + "rewards/cosine_scaled_reward/mean": -0.09087307006120682, + "rewards/cosine_scaled_reward/std": 0.3895137310028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1946.0, + "completions/mean_length": 1226.046875, + "completions/mean_terminated_length": 952.0625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.13485714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2991194427013397, + "learning_rate": 4.951587954676837e-07, + "loss": 0.0, + "num_tokens": 12480628.0, + "reward": 0.6370267868041992, + "reward_std": 0.7525250911712646, + "rewards/cosine_scaled_reward/mean": -0.056486621499061584, + "rewards/cosine_scaled_reward/std": 0.44576171040534973, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4364357888698578, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1038.96875, + "completions/mean_terminated_length": 894.8214721679688, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4483291506767273, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0, + "num_tokens": 12557530.0, + "reward": 0.9855979084968567, + "reward_std": 0.6055079698562622, + "rewards/cosine_scaled_reward/mean": 0.04748644679784775, + "rewards/cosine_scaled_reward/std": 0.47108832001686096, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 856.578125, + "completions/mean_terminated_length": 
818.1451416015625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.13714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3406151831150055, + "learning_rate": 4.79604490731896e-07, + "loss": -0.0, + "num_tokens": 12622807.0, + "reward": 0.7979192733764648, + "reward_std": 0.6180044412612915, + "rewards/cosine_scaled_reward/mean": -0.10104038566350937, + "rewards/cosine_scaled_reward/std": 0.44317325949668884, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1764.0, + "completions/mean_length": 726.34375, + "completions/mean_terminated_length": 683.7096557617188, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.1382857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4178949296474457, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0, + "num_tokens": 12678989.0, + "reward": 1.161607265472412, + "reward_std": 0.6393733024597168, + "rewards/cosine_scaled_reward/mean": 0.08080361783504486, + "rewards/cosine_scaled_reward/std": 0.5313310027122498, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1133.796875, + "completions/mean_terminated_length": 1039.22412109375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.13942857142857143, + 
"frac_reward_zero_std": 0.0, + "grad_norm": 0.3333284258842468, + "learning_rate": 4.641359520805548e-07, + "loss": 0.0, + "num_tokens": 12763112.0, + "reward": 0.9356573820114136, + "reward_std": 0.6247758269309998, + "rewards/cosine_scaled_reward/mean": -0.02435879409313202, + "rewards/cosine_scaled_reward/std": 0.4759780466556549, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1031.296875, + "completions/mean_terminated_length": 981.2950439453125, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "epoch": 0.14057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29939791560173035, + "learning_rate": 4.5643973913200837e-07, + "loss": -0.0, + "num_tokens": 12839347.0, + "reward": 0.7725162506103516, + "reward_std": 0.5560778379440308, + "rewards/cosine_scaled_reward/mean": -0.09811685979366302, + "rewards/cosine_scaled_reward/std": 0.3822804391384125, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 979.234375, + "completions/mean_terminated_length": 944.758056640625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.1417142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34992095828056335, + "learning_rate": 4.4877202554526084e-07, + "loss": 0.0, 
+ "num_tokens": 12912970.0, + "reward": 1.085427165031433, + "reward_std": 0.6837464570999146, + "rewards/cosine_scaled_reward/mean": 0.05052608996629715, + "rewards/cosine_scaled_reward/std": 0.4791998267173767, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1076.40625, + "completions/mean_terminated_length": 994.0678100585938, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.14285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27060386538505554, + "learning_rate": 4.4113514698014953e-07, + "loss": -0.0, + "num_tokens": 12992788.0, + "reward": 1.0397578477859497, + "reward_std": 0.43823006749153137, + "rewards/cosine_scaled_reward/mean": 0.019878946244716644, + "rewards/cosine_scaled_reward/std": 0.46214956045150757, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1856.0, + "completions/mean_length": 1071.53125, + "completions/mean_terminated_length": 1006.4334106445312, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "epoch": 0.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2776121497154236, + "learning_rate": 4.3353142970386557e-07, + "loss": 0.0, + "num_tokens": 13072662.0, + "reward": 1.0028693675994873, + "reward_std": 0.6879971027374268, + "rewards/cosine_scaled_reward/mean": 
0.0014346465468406677, + "rewards/cosine_scaled_reward/std": 0.42488595843315125, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1868.0, + "completions/mean_length": 1180.484375, + "completions/mean_terminated_length": 1056.5535888671875, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.14514285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2829054594039917, + "learning_rate": 4.2596318988235037e-07, + "loss": -0.0, + "num_tokens": 13159309.0, + "reward": 0.6576684713363647, + "reward_std": 0.66895592212677, + "rewards/cosine_scaled_reward/mean": -0.15554077923297882, + "rewards/cosine_scaled_reward/std": 0.3959099054336548, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1869.0, + "completions/mean_length": 1053.328125, + "completions/mean_terminated_length": 950.4310302734375, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.1462857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29738253355026245, + "learning_rate": 4.1843273287476854e-07, + "loss": -0.0, + "num_tokens": 13237074.0, + "reward": 0.8851851224899292, + "reward_std": 0.7390589118003845, + "rewards/cosine_scaled_reward/mean": -0.041782446205616, + "rewards/cosine_scaled_reward/std": 0.46901625394821167, + "rewards/format_reward/mean": 0.96875, + 
"rewards/format_reward/std": 0.17536810040473938, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 1228.484375, + "completions/mean_terminated_length": 1111.4107666015625, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.14742857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25943535566329956, + "learning_rate": 4.1094235253127374e-07, + "loss": -0.0, + "num_tokens": 13326401.0, + "reward": 0.9628820419311523, + "reward_std": 0.6490253210067749, + "rewards/cosine_scaled_reward/mean": 0.004878522828221321, + "rewards/cosine_scaled_reward/std": 0.45456331968307495, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 1089.578125, + "completions/mean_terminated_length": 952.6607666015625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.14857142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3009719252586365, + "learning_rate": 4.034943304942796e-07, + "loss": 0.0, + "num_tokens": 13406638.0, + "reward": 0.5984547138214111, + "reward_std": 0.7008002996444702, + "rewards/cosine_scaled_reward/mean": -0.14608514308929443, + "rewards/cosine_scaled_reward/std": 0.37894922494888306, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1651.0, + "completions/mean_length": 1058.03125, + "completions/mean_terminated_length": 916.607177734375, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.14971428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.306725412607193, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0, + "num_tokens": 13484088.0, + "reward": 1.0469268560409546, + "reward_std": 0.6023457050323486, + "rewards/cosine_scaled_reward/mean": 0.0703384131193161, + "rewards/cosine_scaled_reward/std": 0.47298464179039, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1773.0, + "completions/mean_length": 1342.78125, + "completions/mean_terminated_length": 919.6500244140625, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.15085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3032574951648712, + "learning_rate": 3.8873442270461485e-07, + "loss": -0.0, + "num_tokens": 13581090.0, + "reward": 0.4643245339393616, + "reward_std": 0.7533800601959229, + "rewards/cosine_scaled_reward/mean": -0.06471271812915802, + "rewards/cosine_scaled_reward/std": 0.4610835611820221, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49501484632492065, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, 
+ "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1144.921875, + "completions/mean_terminated_length": 957.4906005859375, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32285141944885254, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.0, + "num_tokens": 13665589.0, + "reward": 0.5014957189559937, + "reward_std": 0.5352932214736938, + "rewards/cosine_scaled_reward/mean": -0.17112717032432556, + "rewards/cosine_scaled_reward/std": 0.28127768635749817, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 975.53125, + "completions/mean_terminated_length": 958.5079956054688, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.15314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40716752409935, + "learning_rate": 3.7417099217982686e-07, + "loss": -0.0, + "num_tokens": 13738591.0, + "reward": 1.1759617328643799, + "reward_std": 0.4804629683494568, + "rewards/cosine_scaled_reward/mean": 0.08798093348741531, + "rewards/cosine_scaled_reward/std": 0.5343761444091797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1686.0, + "completions/max_terminated_length": 1686.0, + 
"completions/mean_length": 758.515625, + "completions/mean_terminated_length": 758.515625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.15428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42696353793144226, + "learning_rate": 3.6696851061588994e-07, + "loss": -0.0, + "num_tokens": 13797608.0, + "reward": 1.3851683139801025, + "reward_std": 0.5234883427619934, + "rewards/cosine_scaled_reward/mean": 0.19258417189121246, + "rewards/cosine_scaled_reward/std": 0.49346473813056946, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1169.875, + "completions/mean_terminated_length": 1095.4576416015625, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.15542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28027620911598206, + "learning_rate": 3.5982178221668533e-07, + "loss": -0.0, + "num_tokens": 13883152.0, + "reward": 1.0174503326416016, + "reward_std": 0.5889347791671753, + "rewards/cosine_scaled_reward/mean": 0.016537662595510483, + "rewards/cosine_scaled_reward/std": 0.4763922095298767, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1814.0, + "completions/mean_length": 1105.3125, + "completions/mean_terminated_length": 1042.4666748046875, + "completions/min_length": 446.0, + 
"completions/min_terminated_length": 446.0, + "epoch": 0.15657142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3002299666404724, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0, + "num_tokens": 13964500.0, + "reward": 0.841381847858429, + "reward_std": 0.6354345083236694, + "rewards/cosine_scaled_reward/mean": -0.07149658352136612, + "rewards/cosine_scaled_reward/std": 0.4138363003730774, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1125.484375, + "completions/mean_terminated_length": 974.5272216796875, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.15771428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28766506910324097, + "learning_rate": 3.45704275117204e-07, + "loss": -0.0, + "num_tokens": 14047843.0, + "reward": 0.8758631944656372, + "reward_std": 0.7212573289871216, + "rewards/cosine_scaled_reward/mean": -0.05425591766834259, + "rewards/cosine_scaled_reward/std": 0.4783853590488434, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1216.171875, + "completions/mean_terminated_length": 1160.7166748046875, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.15885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 
0.2882857024669647, + "learning_rate": 3.387377967463493e-07, + "loss": -0.0, + "num_tokens": 14136318.0, + "reward": 0.7189284563064575, + "reward_std": 0.4593912959098816, + "rewards/cosine_scaled_reward/mean": -0.13272328674793243, + "rewards/cosine_scaled_reward/std": 0.33584704995155334, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1142.140625, + "completions/mean_terminated_length": 1012.732177734375, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.16, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.3000667095184326, + "learning_rate": 3.3183567088914833e-07, + "loss": 0.0, + "num_tokens": 14219639.0, + "reward": 0.8278639316558838, + "reward_std": 0.46724599599838257, + "rewards/cosine_scaled_reward/mean": -0.03919300064444542, + "rewards/cosine_scaled_reward/std": 0.4650508463382721, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1919.0, + "completions/mean_length": 1025.421875, + "completions/mean_terminated_length": 975.131103515625, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.16114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3207882046699524, + "learning_rate": 3.250000000000001e-07, + "loss": 0.0, + "num_tokens": 14295826.0, + "reward": 
0.8871637582778931, + "reward_std": 0.6538586616516113, + "rewards/cosine_scaled_reward/mean": -0.04079316183924675, + "rewards/cosine_scaled_reward/std": 0.43451616168022156, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1233.90625, + "completions/mean_terminated_length": 1149.689697265625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.16228571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3009903132915497, + "learning_rate": 3.182328662904756e-07, + "loss": 0.0, + "num_tokens": 14385300.0, + "reward": 0.8573208451271057, + "reward_std": 0.6099269390106201, + "rewards/cosine_scaled_reward/mean": -0.055714573711156845, + "rewards/cosine_scaled_reward/std": 0.43728360533714294, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 1136.078125, + "completions/mean_terminated_length": 1005.8035888671875, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.16342857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31794917583465576, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0, + "num_tokens": 14468825.0, + "reward": 0.6553314924240112, + "reward_std": 0.6344339847564697, + "rewards/cosine_scaled_reward/mean": 
-0.11764675378799438, + "rewards/cosine_scaled_reward/std": 0.3099633455276489, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 1220.6875, + "completions/mean_terminated_length": 1029.769287109375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.16457142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3814108967781067, + "learning_rate": 3.0491243424323783e-07, + "loss": 0.0, + "num_tokens": 14558437.0, + "reward": 0.7285318970680237, + "reward_std": 0.8925961256027222, + "rewards/cosine_scaled_reward/mean": -0.05760904401540756, + "rewards/cosine_scaled_reward/std": 0.492266446352005, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1880.0, + "completions/mean_length": 969.796875, + "completions/mean_terminated_length": 916.7704467773438, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.1657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3201180398464203, + "learning_rate": 2.9836319343816397e-07, + "loss": -0.0, + "num_tokens": 14630448.0, + "reward": 0.8149441480636597, + "reward_std": 0.5824600458145142, + "rewards/cosine_scaled_reward/mean": -0.08471541851758957, + "rewards/cosine_scaled_reward/std": 0.475755512714386, + "rewards/format_reward/mean": 
0.984375, + "rewards/format_reward/std": 0.125, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1943.0, + "completions/mean_length": 1034.484375, + "completions/mean_terminated_length": 966.9166870117188, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, + "epoch": 0.16685714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28184273838996887, + "learning_rate": 2.918906036420294e-07, + "loss": -0.0, + "num_tokens": 14707271.0, + "reward": 0.8387603759765625, + "reward_std": 0.5346506237983704, + "rewards/cosine_scaled_reward/mean": -0.07280732691287994, + "rewards/cosine_scaled_reward/std": 0.43024110794067383, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 1249.984375, + "completions/mean_terminated_length": 1046.568603515625, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "epoch": 0.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32145801186561584, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0, + "num_tokens": 14798054.0, + "reward": 0.7505484819412231, + "reward_std": 0.5473448634147644, + "rewards/cosine_scaled_reward/mean": -0.07003828883171082, + "rewards/cosine_scaled_reward/std": 0.4046306014060974, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1844.0, + "completions/mean_length": 1062.828125, + "completions/mean_terminated_length": 960.913818359375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.16914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2667451500892639, + "learning_rate": 2.791832395815782e-07, + "loss": -0.0, + "num_tokens": 14877259.0, + "reward": 0.7823130488395691, + "reward_std": 0.48230016231536865, + "rewards/cosine_scaled_reward/mean": -0.06978099048137665, + "rewards/cosine_scaled_reward/std": 0.37567150592803955, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1386.875, + "completions/mean_terminated_length": 1086.3636474609375, + "completions/min_length": 439.0, + "completions/min_terminated_length": 439.0, + "epoch": 0.1702857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2730913758277893, + "learning_rate": 2.729523361034538e-07, + "loss": 0.0, + "num_tokens": 14977915.0, + "reward": 0.48214927315711975, + "reward_std": 0.8376681804656982, + "rewards/cosine_scaled_reward/mean": -0.14173786342144012, + "rewards/cosine_scaled_reward/std": 0.4272434711456299, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1831.0, + "completions/mean_length": 994.15625, + "completions/mean_terminated_length": 942.3278198242188, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.17142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2946690022945404, + "learning_rate": 2.6680582402757324e-07, + "loss": -0.0, + "num_tokens": 15052045.0, + "reward": 0.8893749713897705, + "reward_std": 0.7130615711212158, + "rewards/cosine_scaled_reward/mean": -0.05531252920627594, + "rewards/cosine_scaled_reward/std": 0.4389563202857971, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1094.4375, + "completions/mean_terminated_length": 917.8518676757812, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.17257142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29988256096839905, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0, + "num_tokens": 15132769.0, + "reward": 1.088501214981079, + "reward_std": 0.9213382005691528, + "rewards/cosine_scaled_reward/mean": 0.10675054788589478, + "rewards/cosine_scaled_reward/std": 0.510394811630249, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1976.0, + 
"completions/mean_length": 1024.203125, + "completions/mean_terminated_length": 937.440673828125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.1737142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46614158153533936, + "learning_rate": 2.547734369542718e-07, + "loss": -0.0, + "num_tokens": 15208982.0, + "reward": 0.7280048131942749, + "reward_std": 0.706195592880249, + "rewards/cosine_scaled_reward/mean": -0.10474759340286255, + "rewards/cosine_scaled_reward/std": 0.45987388491630554, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 1180.234375, + "completions/mean_terminated_length": 1056.2679443359375, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.17485714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33488133549690247, + "learning_rate": 2.488912271385139e-07, + "loss": -0.0, + "num_tokens": 15295661.0, + "reward": 0.4985957443714142, + "reward_std": 0.4677598178386688, + "rewards/cosine_scaled_reward/mean": -0.2272646427154541, + "rewards/cosine_scaled_reward/std": 0.2307518571615219, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1340.296875, + "completions/mean_terminated_length": 1142.1400146484375, + 
"completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "epoch": 0.176, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25304633378982544, + "learning_rate": 2.4310073797187573e-07, + "loss": -0.0, + "num_tokens": 15392504.0, + "reward": 0.7636169195175171, + "reward_std": 0.7114115953445435, + "rewards/cosine_scaled_reward/mean": -0.03225403279066086, + "rewards/cosine_scaled_reward/std": 0.42686402797698975, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38025420904159546, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1909.0, + "completions/mean_length": 915.6875, + "completions/mean_terminated_length": 798.5516967773438, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.17714285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35448068380355835, + "learning_rate": 2.374037332934512e-07, + "loss": 0.0, + "num_tokens": 15461732.0, + "reward": 0.736025333404541, + "reward_std": 0.5466883182525635, + "rewards/cosine_scaled_reward/mean": -0.11636234819889069, + "rewards/cosine_scaled_reward/std": 0.43356192111968994, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1913.0, + "completions/mean_length": 1157.90625, + "completions/mean_terminated_length": 952.5000610351562, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.1782857142857143, + 
"frac_reward_zero_std": 0.0, + "grad_norm": 0.4357910454273224, + "learning_rate": 2.3180194846605364e-07, + "loss": -0.0, + "num_tokens": 15545942.0, + "reward": 0.8330824971199036, + "reward_std": 0.725536048412323, + "rewards/cosine_scaled_reward/mean": -0.02095877379179001, + "rewards/cosine_scaled_reward/std": 0.4767586290836334, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1157.75, + "completions/mean_terminated_length": 1030.571533203125, + "completions/min_length": 485.0, + "completions/min_terminated_length": 485.0, + "epoch": 0.17942857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29891225695610046, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.0, + "num_tokens": 15629998.0, + "reward": 0.6674755811691284, + "reward_std": 0.6577311754226685, + "rewards/cosine_scaled_reward/mean": -0.13501222431659698, + "rewards/cosine_scaled_reward/std": 0.36102381348609924, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1856.0, + "completions/mean_length": 1013.6875, + "completions/mean_terminated_length": 962.8196411132812, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.18057142857142858, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.2723560333251953, + "learning_rate": 2.2089083427137329e-07, + "loss": 
0.0, + "num_tokens": 15704994.0, + "reward": 0.9709224104881287, + "reward_std": 0.48810505867004395, + "rewards/cosine_scaled_reward/mean": -0.014538809657096863, + "rewards/cosine_scaled_reward/std": 0.4970093369483948, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 1081.296875, + "completions/mean_terminated_length": 962.5789794921875, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "epoch": 0.18171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2894439697265625, + "learning_rate": 2.1558482853517253e-07, + "loss": -0.0, + "num_tokens": 15785877.0, + "reward": 0.5938807725906372, + "reward_std": 0.592242956161499, + "rewards/cosine_scaled_reward/mean": -0.16399714350700378, + "rewards/cosine_scaled_reward/std": 0.3423241078853607, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1750.0, + "completions/mean_length": 968.25, + "completions/mean_terminated_length": 915.1474609375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.18285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3261898159980774, + "learning_rate": 2.1038068889975259e-07, + "loss": 0.0, + "num_tokens": 15859429.0, + "reward": 1.2050117254257202, + "reward_std": 0.6944217681884766, + 
"rewards/cosine_scaled_reward/mean": 0.10250584781169891, + "rewards/cosine_scaled_reward/std": 0.5283173322677612, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1031.75, + "completions/mean_terminated_length": 945.6271362304688, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34274861216545105, + "learning_rate": 2.0528000059645995e-07, + "loss": -0.0, + "num_tokens": 15935453.0, + "reward": 0.9563960433006287, + "reward_std": 0.6316370964050293, + "rewards/cosine_scaled_reward/mean": 0.009448029100894928, + "rewards/cosine_scaled_reward/std": 0.46292582154273987, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1891.0, + "completions/mean_length": 1167.828125, + "completions/mean_terminated_length": 898.3877563476562, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "epoch": 0.18514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3887297511100769, + "learning_rate": 2.0028431734436308e-07, + "loss": 0.0, + "num_tokens": 16020498.0, + "reward": 0.6932262182235718, + "reward_std": 0.8278101682662964, + "rewards/cosine_scaled_reward/mean": -0.08307439833879471, + "rewards/cosine_scaled_reward/std": 0.3847581744194031, + 
"rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1820.0, + "completions/mean_length": 1058.84375, + "completions/mean_terminated_length": 956.5172119140625, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.18628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30917680263519287, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.0, + "num_tokens": 16099448.0, + "reward": 1.3529155254364014, + "reward_std": 0.8906396627426147, + "rewards/cosine_scaled_reward/mean": 0.22333277761936188, + "rewards/cosine_scaled_reward/std": 0.5322388410568237, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 988.703125, + "completions/mean_terminated_length": 918.0833740234375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.18742857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33646658062934875, + "learning_rate": 1.9061402047871833e-07, + "loss": 0.0, + "num_tokens": 16173253.0, + "reward": 1.046778678894043, + "reward_std": 0.6892427206039429, + "rewards/cosine_scaled_reward/mean": 0.0390143096446991, + "rewards/cosine_scaled_reward/std": 0.4476637840270996, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 164 + }, + { 
+ "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1103.5, + "completions/mean_terminated_length": 948.9454345703125, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.18857142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.338925838470459, + "learning_rate": 1.8594235253127372e-07, + "loss": -0.0, + "num_tokens": 16255293.0, + "reward": 0.7887892723083496, + "reward_std": 0.6329070329666138, + "rewards/cosine_scaled_reward/mean": -0.0665428563952446, + "rewards/cosine_scaled_reward/std": 0.4880979061126709, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1714.0, + "completions/mean_length": 1166.265625, + "completions/mean_terminated_length": 1002.9815063476562, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "epoch": 0.18971428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29118841886520386, + "learning_rate": 1.8138158006995363e-07, + "loss": -0.0, + "num_tokens": 16341510.0, + "reward": 0.5021259784698486, + "reward_std": 0.5949545502662659, + "rewards/cosine_scaled_reward/mean": -0.18643701076507568, + "rewards/cosine_scaled_reward/std": 0.3388413190841675, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1756.0, + "completions/mean_length": 1027.96875, + "completions/mean_terminated_length": 922.4482421875, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.19085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3123703598976135, + "learning_rate": 1.7693309235023127e-07, + "loss": -0.0, + "num_tokens": 16418844.0, + "reward": 0.6054480671882629, + "reward_std": 0.6668864488601685, + "rewards/cosine_scaled_reward/mean": -0.17383846640586853, + "rewards/cosine_scaled_reward/std": 0.34976449608802795, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1131.890625, + "completions/mean_terminated_length": 1086.8360595703125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2515013515949249, + "learning_rate": 1.7259824442455923e-07, + "loss": 0.0, + "num_tokens": 16502125.0, + "reward": 0.929424524307251, + "reward_std": 0.6242066621780396, + "rewards/cosine_scaled_reward/mean": -0.011850237846374512, + "rewards/cosine_scaled_reward/std": 0.4718935191631317, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + 
"completions/max_length": 2048.0, + "completions/max_terminated_length": 1737.0, + "completions/mean_length": 908.53125, + "completions/mean_terminated_length": 871.774169921875, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "epoch": 0.19314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29841023683547974, + "learning_rate": 1.6837835672960831e-07, + "loss": -0.0, + "num_tokens": 16570895.0, + "reward": 1.6184587478637695, + "reward_std": 0.5710533857345581, + "rewards/cosine_scaled_reward/mean": 0.3092293441295624, + "rewards/cosine_scaled_reward/std": 0.5226604342460632, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1005.109375, + "completions/mean_terminated_length": 834.4545288085938, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.19428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3179849088191986, + "learning_rate": 1.6427471468404952e-07, + "loss": -0.0, + "num_tokens": 16645006.0, + "reward": 1.0071099996566772, + "reward_std": 0.3746073246002197, + "rewards/cosine_scaled_reward/mean": 0.06605499982833862, + "rewards/cosine_scaled_reward/std": 0.4378518760204315, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1891.0, + "completions/mean_length": 1234.65625, + 
"completions/mean_terminated_length": 940.4680786132812, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "epoch": 0.19542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2992324233055115, + "learning_rate": 1.6028856829700258e-07, + "loss": -0.0, + "num_tokens": 16734416.0, + "reward": 0.7108581066131592, + "reward_std": 0.7254206538200378, + "rewards/cosine_scaled_reward/mean": -0.02738344669342041, + "rewards/cosine_scaled_reward/std": 0.44080549478530884, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1638.0, + "completions/mean_length": 900.234375, + "completions/mean_terminated_length": 823.7167358398438, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.19657142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47149336338043213, + "learning_rate": 1.5642113178727193e-07, + "loss": 0.0, + "num_tokens": 16802647.0, + "reward": 1.3995718955993652, + "reward_std": 0.5902794599533081, + "rewards/cosine_scaled_reward/mean": 0.2310360074043274, + "rewards/cosine_scaled_reward/std": 0.5026565194129944, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 925.078125, + "completions/mean_terminated_length": 787.1754150390625, + "completions/min_length": 242.0, + 
"completions/min_terminated_length": 242.0, + "epoch": 0.1977142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3893924057483673, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0, + "num_tokens": 16873164.0, + "reward": 0.6720038056373596, + "reward_std": 0.667186975479126, + "rewards/cosine_scaled_reward/mean": -0.12493559718132019, + "rewards/cosine_scaled_reward/std": 0.40216636657714844, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1005.578125, + "completions/mean_terminated_length": 971.9515991210938, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.19885714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39529484510421753, + "learning_rate": 1.4904706411523448e-07, + "loss": -0.0, + "num_tokens": 16947857.0, + "reward": 0.9172019958496094, + "reward_std": 0.6198633313179016, + "rewards/cosine_scaled_reward/mean": -0.03358650952577591, + "rewards/cosine_scaled_reward/std": 0.4403606951236725, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1828.0, + "completions/mean_length": 952.296875, + "completions/mean_terminated_length": 898.4097900390625, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 
0.322712779045105, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.0, + "num_tokens": 17019628.0, + "reward": 0.871549129486084, + "reward_std": 0.46009254455566406, + "rewards/cosine_scaled_reward/mean": -0.05641293525695801, + "rewards/cosine_scaled_reward/std": 0.44415631890296936, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1048.453125, + "completions/mean_terminated_length": 945.0516967773438, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.20114285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3287680447101593, + "learning_rate": 1.4216149583350755e-07, + "loss": -0.0, + "num_tokens": 17097897.0, + "reward": 0.839117705821991, + "reward_std": 0.7753168344497681, + "rewards/cosine_scaled_reward/mean": -0.04137861356139183, + "rewards/cosine_scaled_reward/std": 0.43453913927078247, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1915.0, + "completions/mean_length": 968.34375, + "completions/mean_terminated_length": 933.51611328125, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "epoch": 0.2022857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3266870677471161, + "learning_rate": 1.3890454406082956e-07, + "loss": -0.0, + "num_tokens": 17170095.0, + "reward": 
1.0329997539520264, + "reward_std": 0.7290528416633606, + "rewards/cosine_scaled_reward/mean": 0.024312350898981094, + "rewards/cosine_scaled_reward/std": 0.46764034032821655, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1016.0625, + "completions/mean_terminated_length": 909.3103637695312, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.20342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.330020546913147, + "learning_rate": 1.3577281594640182e-07, + "loss": -0.0, + "num_tokens": 17246659.0, + "reward": 1.1118203401565552, + "reward_std": 0.7913287878036499, + "rewards/cosine_scaled_reward/mean": 0.07934767752885818, + "rewards/cosine_scaled_reward/std": 0.5148099660873413, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 1227.78125, + "completions/mean_terminated_length": 976.69384765625, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.20457142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33415722846984863, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0, + "num_tokens": 17336069.0, + "reward": 0.608305037021637, + "reward_std": 0.5569274425506592, + "rewards/cosine_scaled_reward/mean": -0.10991000384092331, + 
"rewards/cosine_scaled_reward/std": 0.3418741822242737, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38025420904159546, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1932.0, + "completions/mean_length": 1024.46875, + "completions/mean_terminated_length": 956.2333984375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.2057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3402194082736969, + "learning_rate": 1.2988880807625927e-07, + "loss": -0.0, + "num_tokens": 17412811.0, + "reward": 1.6137604713439941, + "reward_std": 0.8008866310119629, + "rewards/cosine_scaled_reward/mean": 0.31469273567199707, + "rewards/cosine_scaled_reward/std": 0.5089212656021118, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1157.546875, + "completions/mean_terminated_length": 992.6481323242188, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.20685714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29356250166893005, + "learning_rate": 1.2713832064634125e-07, + "loss": -0.0, + "num_tokens": 17498366.0, + "reward": 0.7507010698318481, + "reward_std": 0.5088521242141724, + "rewards/cosine_scaled_reward/mean": -0.07777446508407593, + "rewards/cosine_scaled_reward/std": 0.4100310504436493, + "rewards/format_reward/mean": 0.90625, + 
"rewards/format_reward/std": 0.29378482699394226, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1840.0, + "completions/mean_length": 1166.390625, + "completions/mean_terminated_length": 896.5101928710938, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2653217613697052, + "learning_rate": 1.2451664098030743e-07, + "loss": -0.0, + "num_tokens": 17582807.0, + "reward": 0.7447050213813782, + "reward_std": 0.8267481327056885, + "rewards/cosine_scaled_reward/mean": -0.04170997440814972, + "rewards/cosine_scaled_reward/std": 0.4390917420387268, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38025420904159546, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1843.0, + "completions/mean_length": 1003.203125, + "completions/mean_terminated_length": 933.550048828125, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.20914285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3268946707248688, + "learning_rate": 1.220245676671809e-07, + "loss": 0.0, + "num_tokens": 17657628.0, + "reward": 1.0635898113250732, + "reward_std": 0.5967966914176941, + "rewards/cosine_scaled_reward/mean": 0.039607420563697815, + "rewards/cosine_scaled_reward/std": 0.43730178475379944, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 1025.171875, + "completions/mean_terminated_length": 938.4915161132812, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.2102857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36050307750701904, + "learning_rate": 1.1966285981663407e-07, + "loss": 0.0, + "num_tokens": 17734591.0, + "reward": 0.6448719501495361, + "reward_std": 0.503462553024292, + "rewards/cosine_scaled_reward/mean": -0.14631402492523193, + "rewards/cosine_scaled_reward/std": 0.3733954429626465, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1779.0, + "completions/mean_length": 969.015625, + "completions/mean_terminated_length": 934.2096557617188, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.21142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42219310998916626, + "learning_rate": 1.1743223682775649e-07, + "loss": -0.0, + "num_tokens": 17806792.0, + "reward": 0.7470877766609192, + "reward_std": 0.5973426103591919, + "rewards/cosine_scaled_reward/mean": -0.11864358186721802, + "rewards/cosine_scaled_reward/std": 0.41184645891189575, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 
0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1667.0, + "completions/mean_length": 1076.984375, + "completions/mean_terminated_length": 938.2678833007812, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.21257142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30614498257637024, + "learning_rate": 1.1533337816991931e-07, + "loss": -0.0, + "num_tokens": 17886415.0, + "reward": 0.804481029510498, + "reward_std": 0.4629480838775635, + "rewards/cosine_scaled_reward/mean": -0.03525950014591217, + "rewards/cosine_scaled_reward/std": 0.45060867071151733, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 1112.1875, + "completions/mean_terminated_length": 1049.800048828125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.21371428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4846937656402588, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.0, + "num_tokens": 17968019.0, + "reward": 0.6981200575828552, + "reward_std": 0.53022301197052, + "rewards/cosine_scaled_reward/mean": -0.1275024712085724, + "rewards/cosine_scaled_reward/std": 0.38560083508491516, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1910.0, + 
"completions/mean_length": 1079.90625, + "completions/mean_terminated_length": 997.8643798828125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.21485714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34140780568122864, + "learning_rate": 1.1153347084664419e-07, + "loss": -0.0, + "num_tokens": 18048933.0, + "reward": 0.5326423645019531, + "reward_std": 0.5487440824508667, + "rewards/cosine_scaled_reward/mean": -0.22586631774902344, + "rewards/cosine_scaled_reward/std": 0.3085760772228241, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 868.546875, + "completions/mean_terminated_length": 830.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6607878804206848, + "learning_rate": 1.0983357966978745e-07, + "loss": -0.0, + "num_tokens": 18113808.0, + "reward": 0.7490335702896118, + "reward_std": 0.6654466390609741, + "rewards/cosine_scaled_reward/mean": -0.11767073720693588, + "rewards/cosine_scaled_reward/std": 0.4015049338340759, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1713.0, + "completions/mean_length": 938.8125, + "completions/mean_terminated_length": 903.0322265625, + "completions/min_length": 359.0, + 
"completions/min_terminated_length": 359.0, + "epoch": 0.21714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3069080710411072, + "learning_rate": 1.0826776744855121e-07, + "loss": -0.0, + "num_tokens": 18183660.0, + "reward": 0.9838922023773193, + "reward_std": 0.5085676908493042, + "rewards/cosine_scaled_reward/mean": -0.00024138391017913818, + "rewards/cosine_scaled_reward/std": 0.44459760189056396, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1851.0, + "completions/max_terminated_length": 1851.0, + "completions/mean_length": 902.453125, + "completions/mean_terminated_length": 902.453125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.21828571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35081905126571655, + "learning_rate": 1.068365111445064e-07, + "loss": 0.0, + "num_tokens": 18251705.0, + "reward": 1.247175931930542, + "reward_std": 0.8716963529586792, + "rewards/cosine_scaled_reward/mean": 0.13140051066875458, + "rewards/cosine_scaled_reward/std": 0.5292099118232727, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1270.0, + "completions/mean_terminated_length": 1052.1600341796875, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "epoch": 0.21942857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2882588505744934, + 
"learning_rate": 1.0554024673218806e-07, + "loss": -0.0, + "num_tokens": 18344281.0, + "reward": 0.5913476943969727, + "reward_std": 0.6203497052192688, + "rewards/cosine_scaled_reward/mean": -0.11057613790035248, + "rewards/cosine_scaled_reward/std": 0.33690571784973145, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39339789748191833, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1806.0, + "completions/mean_length": 1134.3125, + "completions/mean_terminated_length": 1022.1052856445312, + "completions/min_length": 475.0, + "completions/min_terminated_length": 475.0, + "epoch": 0.22057142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30495956540107727, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0, + "num_tokens": 18428021.0, + "reward": 0.9724597930908203, + "reward_std": 0.6338238716125488, + "rewards/cosine_scaled_reward/mean": 0.025292381644248962, + "rewards/cosine_scaled_reward/std": 0.47308972477912903, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1284.34375, + "completions/mean_terminated_length": 1050.5714111328125, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.22171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29666370153427124, + "learning_rate": 1.0335423176140511e-07, + "loss": -0.0, + "num_tokens": 18521579.0, + "reward": 
0.970361590385437, + "reward_std": 0.8541973829269409, + "rewards/cosine_scaled_reward/mean": 0.055493295192718506, + "rewards/cosine_scaled_reward/std": 0.5139825344085693, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1614.0, + "completions/mean_length": 1111.140625, + "completions/mean_terminated_length": 957.8363037109375, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "epoch": 0.22285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2935192883014679, + "learning_rate": 1.0246514708427701e-07, + "loss": -0.0, + "num_tokens": 18603836.0, + "reward": 0.9238024353981018, + "reward_std": 0.7688024044036865, + "rewards/cosine_scaled_reward/mean": 0.008776212111115456, + "rewards/cosine_scaled_reward/std": 0.4346567392349243, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1913.0, + "completions/mean_length": 1110.28125, + "completions/mean_terminated_length": 1064.163818359375, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "epoch": 0.224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31850409507751465, + "learning_rate": 1.017123858587145e-07, + "loss": 0.0, + "num_tokens": 18686486.0, + "reward": 1.0064561367034912, + "reward_std": 0.6142268776893616, + "rewards/cosine_scaled_reward/mean": 0.0032280460000038147, + 
"rewards/cosine_scaled_reward/std": 0.4689313769340515, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1862.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 867.390625, + "completions/mean_terminated_length": 867.390625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.22514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36897119879722595, + "learning_rate": 1.0109617738307911e-07, + "loss": 0.0, + "num_tokens": 18752367.0, + "reward": 1.2200298309326172, + "reward_std": 0.7840542197227478, + "rewards/cosine_scaled_reward/mean": 0.11001493036746979, + "rewards/cosine_scaled_reward/std": 0.5105303525924683, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1018.171875, + "completions/mean_terminated_length": 911.637939453125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.22628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33654487133026123, + "learning_rate": 1.0061670936044178e-07, + "loss": 0.0, + "num_tokens": 18829034.0, + "reward": 1.0653846263885498, + "reward_std": 0.7624523043632507, + "rewards/cosine_scaled_reward/mean": 0.04831730201840401, + "rewards/cosine_scaled_reward/std": 0.4961619973182678, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 198 
+ }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1170.84375, + "completions/mean_terminated_length": 1096.5084228515625, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "epoch": 0.22742857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28278952836990356, + "learning_rate": 1.002741278414069e-07, + "loss": 0.0, + "num_tokens": 18915472.0, + "reward": 0.6831471920013428, + "reward_std": 0.6951984167098999, + "rewards/cosine_scaled_reward/mean": -0.1506139189004898, + "rewards/cosine_scaled_reward/std": 0.34608688950538635, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1764.0, + "completions/mean_length": 999.390625, + "completions/mean_terminated_length": 849.5892944335938, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "epoch": 0.22857142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28817513585090637, + "learning_rate": 1.0006853717962393e-07, + "loss": 0.0, + "num_tokens": 18989553.0, + "reward": 0.9030377864837646, + "reward_std": 0.8171917200088501, + "rewards/cosine_scaled_reward/mean": -0.01723114401102066, + "rewards/cosine_scaled_reward/std": 0.4829805791378021, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 200 + } + ], + "logging_steps": 1, + "max_steps": 200, + "num_input_tokens_seen": 18989553, + "num_train_epochs": 1, + 
"save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin new file mode 100644 index 0000000..9e03ee7 --- /dev/null +++ b/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec4fbe7e29ae78abab9c9fd5555870c1bffe7656ceef4ac5fa2148a15b61b1e3 +size 8888 diff --git a/checkpoint-200/zero_to_fp32.py b/checkpoint-200/zero_to_fp32.py new file mode 100644 index 0000000..0e75914 --- /dev/null +++ b/checkpoint-200/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint.
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint.
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-50/config.json b/checkpoint-50/config.json new file mode 100644 index 0000000..78fed5b --- /dev/null +++ b/checkpoint-50/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": 4096, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_cache": false, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/checkpoint-50/generation_config.json b/checkpoint-50/generation_config.json new file mode 100644 index 0000000..92878bd --- /dev/null +++ b/checkpoint-50/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-50/global_step50/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-50/global_step50/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..49a88ee --- /dev/null 
+++ b/checkpoint-50/global_step50/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15b6a9782b696b9648759e23b9a015a229cf06021f44496888b5b974982b73d0 +size 5331274140 diff --git a/checkpoint-50/global_step50/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-50/global_step50/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..28ad062 --- /dev/null +++ b/checkpoint-50/global_step50/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f8ce47abef1a09396a2597f93a6dab8419eb47fb54f07cef9745062ef5e6150 +size 5331276572 diff --git a/checkpoint-50/global_step50/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-50/global_step50/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..a564453 --- /dev/null +++ b/checkpoint-50/global_step50/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:999e75772985222df760220d28f2111b606d3fb2b1158595ae4c2a3dd6426a9b +size 5331276892 diff --git a/checkpoint-50/global_step50/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-50/global_step50/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000..245973f --- /dev/null +++ b/checkpoint-50/global_step50/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a709181aaa3478b82261661698e5e2ae9ccd8b5a5eaa4d6872be8b7346ba0fa5 +size 5331273884 diff --git a/checkpoint-50/global_step50/mp_rank_00_model_states.pt b/checkpoint-50/global_step50/mp_rank_00_model_states.pt new file mode 100644 index 0000000..0f58570 --- /dev/null +++ b/checkpoint-50/global_step50/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36c107850937f54f3ab44e97309ceb540caccbb4f6dd4fb455172a625fea33b7 +size 
3554267640 diff --git a/checkpoint-50/latest b/checkpoint-50/latest new file mode 100644 index 0000000..9b4dc80 --- /dev/null +++ b/checkpoint-50/latest @@ -0,0 +1 @@ +global_step50 \ No newline at end of file diff --git a/checkpoint-50/model.safetensors b/checkpoint-50/model.safetensors new file mode 100644 index 0000000..ee7ca9a --- /dev/null +++ b/checkpoint-50/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afa546fe36442ef1eadd1abe75c4e1cc7f12d3eb73207863f1c8275e5e09d219 +size 3554214752 diff --git a/checkpoint-50/rng_state_0.pth b/checkpoint-50/rng_state_0.pth new file mode 100644 index 0000000..53282b7 --- /dev/null +++ b/checkpoint-50/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c144d7042faa07cd1b3e09bc6db4e37259092c2146f9f694ec2741bf55635f63 +size 14960 diff --git a/checkpoint-50/rng_state_1.pth b/checkpoint-50/rng_state_1.pth new file mode 100644 index 0000000..5ec9370 --- /dev/null +++ b/checkpoint-50/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f8ad4f83876ffc17dd3af676c2318f13a70ea40dda9ffa3802ec5bc5ad03eb3 +size 14960 diff --git a/checkpoint-50/rng_state_2.pth b/checkpoint-50/rng_state_2.pth new file mode 100644 index 0000000..77cdca5 --- /dev/null +++ b/checkpoint-50/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9b346a73b0296c1aab4bbcd1a7c35830287f75f12d6c38a95f96663bc882c00 +size 14960 diff --git a/checkpoint-50/rng_state_3.pth b/checkpoint-50/rng_state_3.pth new file mode 100644 index 0000000..3ea893c --- /dev/null +++ b/checkpoint-50/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0276cfb938f62268a0976c5009fbeeac22aa3317b639d8877e741f6279007a62 +size 14960 diff --git a/checkpoint-50/scheduler.pt b/checkpoint-50/scheduler.pt new file mode 100644 index 0000000..be8f09d --- /dev/null +++ b/checkpoint-50/scheduler.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:0208179da2b605778b21720a99ccfb3d5e515115ee90824c90bfcabf8ad99120 +size 1064 diff --git a/checkpoint-50/special_tokens_map.json b/checkpoint-50/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/checkpoint-50/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-50/tokenizer.json b/checkpoint-50/tokenizer.json new file mode 100644 index 0000000..e7cd2c1 --- /dev/null +++ b/checkpoint-50/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a +size 11422959 diff --git a/checkpoint-50/tokenizer_config.json b/checkpoint-50/tokenizer_config.json new file mode 100644 index 0000000..ef6e98c --- /dev/null +++ b/checkpoint-50/tokenizer_config.json @@ -0,0 +1,195 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] 
== 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/checkpoint-50/trainer_state.json b/checkpoint-50/trainer_state.json new file mode 100644 index 0000000..6678a05 --- /dev/null +++ b/checkpoint-50/trainer_state.json @@ -0,0 +1,1384 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.05714285714285714, + "eval_steps": 500, + "global_step": 50, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 1702.03125, + 
"completions/mean_terminated_length": 993.6190795898438, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.001142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2544386684894562, + "learning_rate": 0.0, + "loss": -0.0, + "num_tokens": 118418.0, + "reward": 0.17899775505065918, + "reward_std": 0.7650213241577148, + "rewards/cosine_scaled_reward/mean": -0.09800112992525101, + "rewards/cosine_scaled_reward/std": 0.37953105568885803, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.48795005679130554, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1894.0, + "completions/mean_length": 1738.90625, + "completions/mean_terminated_length": 949.0, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.002285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2436082512140274, + "learning_rate": 5e-08, + "loss": -0.0, + "num_tokens": 239748.0, + "reward": 0.3848632574081421, + "reward_std": 0.9111153483390808, + "rewards/cosine_scaled_reward/mean": 0.020556632429361343, + "rewards/cosine_scaled_reward/std": 0.4492928683757782, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1545.0, + "completions/mean_length": 1989.015625, + "completions/mean_terminated_length": 1104.25, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "epoch": 0.0034285714285714284, + 
"frac_reward_zero_std": 0.0, + "grad_norm": 0.2544717788696289, + "learning_rate": 1e-07, + "loss": -0.0, + "num_tokens": 377517.0, + "reward": -0.3279358148574829, + "reward_std": 0.33216947317123413, + "rewards/cosine_scaled_reward/mean": -0.20303040742874146, + "rewards/cosine_scaled_reward/std": 0.179075226187706, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.27048972249031067, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1566.421875, + "completions/mean_terminated_length": 1084.84375, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "epoch": 0.004571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28807103633880615, + "learning_rate": 1.5e-07, + "loss": -0.0, + "num_tokens": 487576.0, + "reward": 0.2716121971607208, + "reward_std": 0.6643469333648682, + "rewards/cosine_scaled_reward/mean": -0.12981891632080078, + "rewards/cosine_scaled_reward/std": 0.3019586503505707, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.5029674172401428, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1807.0, + "completions/mean_length": 1936.84375, + "completions/mean_terminated_length": 1031.71435546875, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "epoch": 0.005714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26783761382102966, + "learning_rate": 2e-07, + "loss": -0.0, + "num_tokens": 622350.0, + "reward": 
-0.3612896800041199, + "reward_std": 0.41048353910446167, + "rewards/cosine_scaled_reward/mean": -0.23533234000205994, + "rewards/cosine_scaled_reward/std": 0.20467400550842285, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.3145764470100403, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1301.0, + "completions/mean_length": 1889.453125, + "completions/mean_terminated_length": 779.625, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "epoch": 0.006857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.262518972158432, + "learning_rate": 2.5e-07, + "loss": 0.0, + "num_tokens": 754923.0, + "reward": -0.29250282049179077, + "reward_std": 0.5422531962394714, + "rewards/cosine_scaled_reward/mean": -0.22437641024589539, + "rewards/cosine_scaled_reward/std": 0.22509199380874634, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36596253514289856, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1568.0, + "completions/mean_length": 1921.921875, + "completions/mean_terminated_length": 1314.45458984375, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "epoch": 0.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22601397335529327, + "learning_rate": 3e-07, + "loss": 0.0, + "num_tokens": 888334.0, + "reward": 0.025340259075164795, + "reward_std": 0.7285393476486206, + "rewards/cosine_scaled_reward/mean": -0.1279548704624176, + "rewards/cosine_scaled_reward/std": 
0.40222346782684326, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1736.859375, + "completions/mean_terminated_length": 999.9473876953125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.009142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24552854895591736, + "learning_rate": 3.5e-07, + "loss": 0.0, + "num_tokens": 1009909.0, + "reward": 0.21729671955108643, + "reward_std": 0.6989120244979858, + "rewards/cosine_scaled_reward/mean": -0.055414143949747086, + "rewards/cosine_scaled_reward/std": 0.47493892908096313, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.4732423722743988, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 1967.53125, + "completions/mean_terminated_length": 1475.77783203125, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "epoch": 0.010285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2430322915315628, + "learning_rate": 4e-07, + "loss": 0.0, + "num_tokens": 1147287.0, + "reward": -0.21451422572135925, + "reward_std": 0.587526798248291, + "rewards/cosine_scaled_reward/mean": -0.19319462776184082, + "rewards/cosine_scaled_reward/std": 0.29357606172561646, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.38025420904159546, + "step": 9 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1708.546875, + "completions/mean_terminated_length": 961.75, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.011428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2543582320213318, + "learning_rate": 4.5e-07, + "loss": 0.0, + "num_tokens": 1267466.0, + "reward": 0.02539752423763275, + "reward_std": 0.545810341835022, + "rewards/cosine_scaled_reward/mean": -0.14355123043060303, + "rewards/cosine_scaled_reward/std": 0.36147356033325195, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.467176616191864, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1579.0, + "completions/mean_length": 1967.734375, + "completions/mean_terminated_length": 1191.8333740234375, + "completions/min_length": 843.0, + "completions/min_terminated_length": 843.0, + "epoch": 0.012571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24583907425403595, + "learning_rate": 5e-07, + "loss": -0.0, + "num_tokens": 1405073.0, + "reward": -0.46971434354782104, + "reward_std": 0.36104393005371094, + "rewards/cosine_scaled_reward/mean": -0.28173214197158813, + "rewards/cosine_scaled_reward/std": 0.17775526642799377, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29378482699394226, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, 
+ "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 1707.5625, + "completions/mean_terminated_length": 1176.47998046875, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.013714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3135142922401428, + "learning_rate": 5.5e-07, + "loss": -0.0, + "num_tokens": 1525301.0, + "reward": 0.0018395520746707916, + "reward_std": 0.7012988328933716, + "rewards/cosine_scaled_reward/mean": -0.21783021092414856, + "rewards/cosine_scaled_reward/std": 0.324150949716568, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.5, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1745.0, + "completions/mean_length": 1841.96875, + "completions/mean_terminated_length": 1168.933349609375, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.014857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2532394826412201, + "learning_rate": 6e-07, + "loss": -0.0, + "num_tokens": 1654227.0, + "reward": -0.10322706401348114, + "reward_std": 0.6915165185928345, + "rewards/cosine_scaled_reward/mean": -0.17661353945732117, + "rewards/cosine_scaled_reward/std": 0.329875111579895, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 
1816.390625, + "completions/mean_terminated_length": 1306.8499755859375, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "epoch": 0.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28405147790908813, + "learning_rate": 6.5e-07, + "loss": 0.0, + "num_tokens": 1781084.0, + "reward": 0.10602855682373047, + "reward_std": 0.630502462387085, + "rewards/cosine_scaled_reward/mean": -0.11104822158813477, + "rewards/cosine_scaled_reward/std": 0.3846627473831177, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.4732423722743988, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1574.0, + "completions/mean_length": 1702.109375, + "completions/mean_terminated_length": 818.1666870117188, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.017142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28779250383377075, + "learning_rate": 7e-07, + "loss": 0.0, + "num_tokens": 1900939.0, + "reward": 0.32734519243240356, + "reward_std": 0.3870265483856201, + "rewards/cosine_scaled_reward/mean": 0.007422588765621185, + "rewards/cosine_scaled_reward/std": 0.45787373185157776, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.467176616191864, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 
0.018285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2337152510881424, + "learning_rate": 7.5e-07, + "loss": -0.0, + "num_tokens": 2042451.0, + "reward": -0.5429925918579102, + "reward_std": 0.3153150975704193, + "rewards/cosine_scaled_reward/mean": -0.2714962661266327, + "rewards/cosine_scaled_reward/std": 0.1678173691034317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 1564.921875, + "completions/mean_terminated_length": 858.8846435546875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.019428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33599403500556946, + "learning_rate": 8e-07, + "loss": -0.0, + "num_tokens": 2153126.0, + "reward": 0.17696775496006012, + "reward_std": 0.6489306688308716, + "rewards/cosine_scaled_reward/mean": -0.11464111506938934, + "rewards/cosine_scaled_reward/std": 0.3551919758319855, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49501484632492065, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 1795.390625, + "completions/mean_terminated_length": 893.21435546875, + "completions/min_length": 619.0, + "completions/min_terminated_length": 619.0, + "epoch": 0.02057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22697053849697113, + "learning_rate": 8.499999999999999e-07, + "loss": -0.0, + "num_tokens": 
2278407.0, + "reward": -0.10711958259344101, + "reward_std": 0.5238703489303589, + "rewards/cosine_scaled_reward/mean": -0.1785597801208496, + "rewards/cosine_scaled_reward/std": 0.2545098662376404, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 1921.484375, + "completions/mean_terminated_length": 1238.300048828125, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "epoch": 0.021714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23972108960151672, + "learning_rate": 9e-07, + "loss": 0.0, + "num_tokens": 2412638.0, + "reward": 0.029344379901885986, + "reward_std": 0.6719281077384949, + "rewards/cosine_scaled_reward/mean": -0.086890310049057, + "rewards/cosine_scaled_reward/std": 0.40220555663108826, + "rewards/format_reward/mean": 0.203125, + "rewards/format_reward/std": 0.40550529956817627, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1728.5625, + "completions/mean_terminated_length": 845.4117431640625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.022857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23309311270713806, + "learning_rate": 9.499999999999999e-07, + "loss": 0.0, + "num_tokens": 2534618.0, + "reward": 0.0131673663854599, + "reward_std": 0.4436222314834595, + "rewards/cosine_scaled_reward/mean": 
-0.13404130935668945, + "rewards/cosine_scaled_reward/std": 0.32819250226020813, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 1777.953125, + "completions/mean_terminated_length": 1087.8333740234375, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29990270733833313, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 2659215.0, + "reward": -0.1764472872018814, + "reward_std": 0.5121938586235046, + "rewards/cosine_scaled_reward/mean": -0.2444736361503601, + "rewards/cosine_scaled_reward/std": 0.289971262216568, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.467176616191864, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1361.28125, + "completions/mean_terminated_length": 921.0769653320312, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.025142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29922786355018616, + "learning_rate": 9.99931462820376e-07, + "loss": -0.0, + "num_tokens": 2755353.0, + "reward": 0.6089149713516235, + "reward_std": 0.5986809730529785, + "rewards/cosine_scaled_reward/mean": -0.05491749942302704, + "rewards/cosine_scaled_reward/std": 0.39076483249664307, + "rewards/format_reward/mean": 0.71875, + 
"rewards/format_reward/std": 0.4531635046005249, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1565.046875, + "completions/mean_terminated_length": 903.2222290039062, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.026285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27512773871421814, + "learning_rate": 9.997258721585931e-07, + "loss": -0.0, + "num_tokens": 2866308.0, + "reward": 0.21871733665466309, + "reward_std": 0.5976030826568604, + "rewards/cosine_scaled_reward/mean": -0.10157884657382965, + "rewards/cosine_scaled_reward/std": 0.3856185972690582, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.49776285886764526, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 1801.671875, + "completions/mean_terminated_length": 1259.75, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "epoch": 0.027428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22642865777015686, + "learning_rate": 9.993832906395582e-07, + "loss": -0.0, + "num_tokens": 2992543.0, + "reward": 0.04899948835372925, + "reward_std": 0.8525694608688354, + "rewards/cosine_scaled_reward/mean": -0.17081275582313538, + "rewards/cosine_scaled_reward/std": 0.3993513882160187, + "rewards/format_reward/mean": 0.390625, + "rewards/format_reward/std": 0.4917473793029785, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1715.765625, + "completions/mean_terminated_length": 1035.4761962890625, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.02857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25316134095191956, + "learning_rate": 9.989038226169207e-07, + "loss": -0.0, + "num_tokens": 3112648.0, + "reward": 0.10585837811231613, + "reward_std": 0.7828943729400635, + "rewards/cosine_scaled_reward/mean": -0.11894579976797104, + "rewards/cosine_scaled_reward/std": 0.4141720235347748, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 1917.703125, + "completions/mean_terminated_length": 1452.357177734375, + "completions/min_length": 840.0, + "completions/min_terminated_length": 840.0, + "epoch": 0.029714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2521306574344635, + "learning_rate": 9.982876141412855e-07, + "loss": -0.0, + "num_tokens": 3246013.0, + "reward": 0.17620250582695007, + "reward_std": 0.6548349857330322, + "rewards/cosine_scaled_reward/mean": -0.08377375453710556, + "rewards/cosine_scaled_reward/std": 0.3527655303478241, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1851.015625, + "completions/mean_terminated_length": 1147.5, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "epoch": 0.030857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2730060815811157, + "learning_rate": 9.975348529157229e-07, + "loss": -0.0, + "num_tokens": 3374766.0, + "reward": -0.18854813277721405, + "reward_std": 0.49348777532577515, + "rewards/cosine_scaled_reward/mean": -0.21146157383918762, + "rewards/cosine_scaled_reward/std": 0.2601618766784668, + "rewards/format_reward/mean": 0.234375, + "rewards/format_reward/std": 0.42695629596710205, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1798.328125, + "completions/mean_terminated_length": 1049.3125, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2566036880016327, + "learning_rate": 9.96645768238595e-07, + "loss": 0.0, + "num_tokens": 3500195.0, + "reward": 0.06705980002880096, + "reward_std": 0.7090284824371338, + "rewards/cosine_scaled_reward/mean": -0.10709509253501892, + "rewards/cosine_scaled_reward/std": 0.4101051986217499, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 
1988.0, + "completions/mean_length": 1930.203125, + "completions/mean_terminated_length": 1210.3333740234375, + "completions/min_length": 582.0, + "completions/min_terminated_length": 582.0, + "epoch": 0.03314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25197461247444153, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0, + "num_tokens": 3634200.0, + "reward": -0.2462695688009262, + "reward_std": 0.5237302780151367, + "rewards/cosine_scaled_reward/mean": -0.2012597918510437, + "rewards/cosine_scaled_reward/std": 0.23252712190151215, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36596253514289856, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1900.0, + "completions/mean_length": 1847.65625, + "completions/mean_terminated_length": 1061.6923828125, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "epoch": 0.03428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30431485176086426, + "learning_rate": 9.944597532678119e-07, + "loss": 0.0, + "num_tokens": 3762986.0, + "reward": -0.05392302945256233, + "reward_std": 0.7249555587768555, + "rewards/cosine_scaled_reward/mean": -0.15196150541305542, + "rewards/cosine_scaled_reward/std": 0.34566983580589294, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1860.0, + "completions/mean_length": 1838.671875, + "completions/mean_terminated_length": 931.5833740234375, + 
"completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.03542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2484513372182846, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0, + "num_tokens": 3891157.0, + "reward": -0.11271396279335022, + "reward_std": 0.6705260872840881, + "rewards/cosine_scaled_reward/mean": -0.1813569962978363, + "rewards/cosine_scaled_reward/std": 0.4071698486804962, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1715.0, + "completions/mean_length": 1910.109375, + "completions/mean_terminated_length": 1417.6429443359375, + "completions/min_length": 906.0, + "completions/min_terminated_length": 906.0, + "epoch": 0.036571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25329527258872986, + "learning_rate": 9.917322325514487e-07, + "loss": -0.0, + "num_tokens": 4023756.0, + "reward": -0.08931556344032288, + "reward_std": 0.6381070613861084, + "rewards/cosine_scaled_reward/mean": -0.16965776681900024, + "rewards/cosine_scaled_reward/std": 0.37385129928588867, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1865.0, + "completions/mean_length": 2023.71875, + "completions/mean_terminated_length": 1530.0, + "completions/min_length": 1107.0, + "completions/min_terminated_length": 1107.0, + "epoch": 0.037714285714285714, + 
"frac_reward_zero_std": 0.0, + "grad_norm": 0.22758109867572784, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0, + "num_tokens": 4164490.0, + "reward": -0.4589868187904358, + "reward_std": 0.5177067518234253, + "rewards/cosine_scaled_reward/mean": -0.2919934093952179, + "rewards/cosine_scaled_reward/std": 0.2252870500087738, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3333333432674408, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1454.78125, + "completions/mean_terminated_length": 963.2571411132812, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.038857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3234354257583618, + "learning_rate": 9.88466529153356e-07, + "loss": 0.0, + "num_tokens": 4267148.0, + "reward": 0.656031608581543, + "reward_std": 0.7529654502868652, + "rewards/cosine_scaled_reward/mean": 0.05457830801606178, + "rewards/cosine_scaled_reward/std": 0.49684229493141174, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.501733124256134, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1724.0, + "completions/mean_length": 1819.078125, + "completions/mean_terminated_length": 716.0909423828125, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "epoch": 0.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2821458876132965, + "learning_rate": 9.866330768241983e-07, + "loss": -0.0, + 
"num_tokens": 4395065.0, + "reward": -0.09630556404590607, + "reward_std": 0.7089139223098755, + "rewards/cosine_scaled_reward/mean": -0.15752778947353363, + "rewards/cosine_scaled_reward/std": 0.3647947609424591, + "rewards/format_reward/mean": 0.21875, + "rewards/format_reward/std": 0.4166666865348816, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1811.0, + "completions/mean_length": 1954.34375, + "completions/mean_terminated_length": 1382.0, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "epoch": 0.04114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24163897335529327, + "learning_rate": 9.846666218300807e-07, + "loss": -0.0, + "num_tokens": 4531255.0, + "reward": -0.34593287110328674, + "reward_std": 0.44493502378463745, + "rewards/cosine_scaled_reward/mean": -0.24327893555164337, + "rewards/cosine_scaled_reward/std": 0.24784433841705322, + "rewards/format_reward/mean": 0.140625, + "rewards/format_reward/std": 0.3503824472427368, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1723.0, + "completions/mean_length": 1868.921875, + "completions/mean_terminated_length": 1092.916748046875, + "completions/min_length": 620.0, + "completions/min_terminated_length": 620.0, + "epoch": 0.04228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24795544147491455, + "learning_rate": 9.825677631722435e-07, + "loss": -0.0, + "num_tokens": 4661890.0, + "reward": -0.23053905367851257, + "reward_std": 0.34036368131637573, + 
"rewards/cosine_scaled_reward/mean": -0.2246445268392563, + "rewards/cosine_scaled_reward/std": 0.15942412614822388, + "rewards/format_reward/mean": 0.21875, + "rewards/format_reward/std": 0.4166666865348816, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 1889.53125, + "completions/mean_terminated_length": 1033.800048828125, + "completions/min_length": 810.0, + "completions/min_terminated_length": 810.0, + "epoch": 0.04342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24283826351165771, + "learning_rate": 9.80337140183366e-07, + "loss": 0.0, + "num_tokens": 4794532.0, + "reward": -0.10043507814407349, + "reward_std": 0.47925832867622375, + "rewards/cosine_scaled_reward/mean": -0.13615503907203674, + "rewards/cosine_scaled_reward/std": 0.3336707651615143, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.38025420904159546, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1515.0, + "completions/mean_length": 1644.828125, + "completions/mean_terminated_length": 689.9473876953125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.044571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28362998366355896, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0, + "num_tokens": 4910585.0, + "reward": 0.12284853309392929, + "reward_std": 0.4183085858821869, + "rewards/cosine_scaled_reward/mean": -0.11045074462890625, + "rewards/cosine_scaled_reward/std": 
0.30217844247817993, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1632.0, + "completions/mean_length": 1618.28125, + "completions/mean_terminated_length": 902.0833740234375, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.045714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.262617826461792, + "learning_rate": 9.754833590196926e-07, + "loss": 0.0, + "num_tokens": 5024227.0, + "reward": 0.2076582908630371, + "reward_std": 0.42125773429870605, + "rewards/cosine_scaled_reward/mean": -0.12273336946964264, + "rewards/cosine_scaled_reward/std": 0.4404613971710205, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1914.0, + "completions/mean_length": 1717.734375, + "completions/mean_terminated_length": 1235.0384521484375, + "completions/min_length": 664.0, + "completions/min_terminated_length": 664.0, + "epoch": 0.046857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23294499516487122, + "learning_rate": 9.728616793536587e-07, + "loss": -0.0, + "num_tokens": 5145314.0, + "reward": 0.011502981185913086, + "reward_std": 0.6816084980964661, + "rewards/cosine_scaled_reward/mean": -0.22081100940704346, + "rewards/cosine_scaled_reward/std": 0.37589573860168457, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, 
+ "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1672.0, + "completions/mean_length": 1703.921875, + "completions/mean_terminated_length": 579.933349609375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34672290086746216, + "learning_rate": 9.701111919237408e-07, + "loss": -0.0, + "num_tokens": 5264725.0, + "reward": -0.2616002857685089, + "reward_std": 0.37952175736427307, + "rewards/cosine_scaled_reward/mean": -0.26361262798309326, + "rewards/cosine_scaled_reward/std": 0.17531204223632812, + "rewards/format_reward/mean": 0.265625, + "rewards/format_reward/std": 0.44515693187713623, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1370.0, + "completions/mean_length": 1681.84375, + "completions/mean_terminated_length": 814.631591796875, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.04914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.263967901468277, + "learning_rate": 9.672327345550543e-07, + "loss": -0.0, + "num_tokens": 5383979.0, + "reward": 0.13376155495643616, + "reward_std": 0.46012288331985474, + "rewards/cosine_scaled_reward/mean": -0.08155670762062073, + "rewards/cosine_scaled_reward/std": 0.3612325191497803, + "rewards/format_reward/mean": 0.296875, + "rewards/format_reward/std": 0.4604927599430084, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 1624.625, + "completions/mean_terminated_length": 869.9130859375, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.05028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28927963972091675, + "learning_rate": 9.64227184053598e-07, + "loss": -0.0, + "num_tokens": 5498651.0, + "reward": 0.20869271457195282, + "reward_std": 0.5558150410652161, + "rewards/cosine_scaled_reward/mean": -0.0987786278128624, + "rewards/cosine_scaled_reward/std": 0.42912590503692627, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49501484632492065, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1851.0, + "completions/mean_length": 2006.96875, + "completions/mean_terminated_length": 1522.800048828125, + "completions/min_length": 955.0, + "completions/min_terminated_length": 955.0, + "epoch": 0.05142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24254000186920166, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0, + "num_tokens": 5638753.0, + "reward": -0.2540697157382965, + "reward_std": 0.4600578844547272, + "rewards/cosine_scaled_reward/mean": -0.20515984296798706, + "rewards/cosine_scaled_reward/std": 0.3251590430736542, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36596253514289856, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + 
"completions/max_length": 2048.0, + "completions/max_terminated_length": 1563.0, + "completions/mean_length": 1765.984375, + "completions/mean_terminated_length": 919.9375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "epoch": 0.052571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2645930349826813, + "learning_rate": 9.578385041664925e-07, + "loss": 0.0, + "num_tokens": 5762944.0, + "reward": -0.213707834482193, + "reward_std": 0.38778313994407654, + "rewards/cosine_scaled_reward/mean": -0.2318539321422577, + "rewards/cosine_scaled_reward/std": 0.21436986327171326, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1583.40625, + "completions/mean_terminated_length": 986.0714721679688, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.053714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.311797559261322, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0, + "num_tokens": 5874682.0, + "reward": 0.27925533056259155, + "reward_std": 0.6467443704605103, + "rewards/cosine_scaled_reward/mean": -0.07912233471870422, + "rewards/cosine_scaled_reward/std": 0.4737093150615692, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.5, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 1690.0625, + 
"completions/mean_terminated_length": 1006.727294921875, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.054857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26644304394721985, + "learning_rate": 9.509529358847654e-07, + "loss": -0.0, + "num_tokens": 5993390.0, + "reward": 0.13692031800746918, + "reward_std": 0.5655145049095154, + "rewards/cosine_scaled_reward/mean": -0.12685233354568481, + "rewards/cosine_scaled_reward/std": 0.32320985198020935, + "rewards/format_reward/mean": 0.390625, + "rewards/format_reward/std": 0.4917473793029785, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1387.140625, + "completions/mean_terminated_length": 804.0294189453125, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3078882396221161, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0, + "num_tokens": 6092231.0, + "reward": 0.35559189319610596, + "reward_std": 0.5927403569221497, + "rewards/cosine_scaled_reward/mean": -0.09564155340194702, + "rewards/cosine_scaled_reward/std": 0.4046906530857086, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.501733124256134, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 1674.890625, + "completions/mean_terminated_length": 962.5909423828125, + "completions/min_length": 318.0, + 
"completions/min_terminated_length": 318.0, + "epoch": 0.05714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23925544321537018, + "learning_rate": 9.43578868212728e-07, + "loss": -0.0, + "num_tokens": 6210240.0, + "reward": 0.18573230504989624, + "reward_std": 0.5264967083930969, + "rewards/cosine_scaled_reward/mean": -0.09463384002447128, + "rewards/cosine_scaled_reward/std": 0.4100942015647888, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.48795005679130554, + "step": 50 + } + ], + "logging_steps": 1, + "max_steps": 200, + "num_input_tokens_seen": 6210240, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-50/training_args.bin b/checkpoint-50/training_args.bin new file mode 100644 index 0000000..9e03ee7 --- /dev/null +++ b/checkpoint-50/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec4fbe7e29ae78abab9c9fd5555870c1bffe7656ceef4ac5fa2148a15b61b1e3 +size 8888 diff --git a/checkpoint-50/zero_to_fp32.py b/checkpoint-50/zero_to_fp32.py new file mode 100644 index 0000000..0e75914 --- /dev/null +++ b/checkpoint-50/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . 
output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + 
ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def 
parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/config.json b/config.json new file mode 100644 index 0000000..ae35726 --- /dev/null +++ b/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": 4096, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..92878bd --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.51.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..24c1613 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:721e7cd7a52fbe85031e588ef9dd53b84820dc30295efc7a202ec5bf16e6a44d +size 3554214752 diff --git 
a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..e7cd2c1 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a +size 11422959 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ef6e98c --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,195 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + 
"151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = 
content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..2c27fe5 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 3.2957177609205244e-09, + "train_runtime": 10011.2078, + "train_samples": 7000, + "train_samples_per_second": 1.279, + "train_steps_per_second": 0.02 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..fff1ad1 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,5443 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.22857142857142856, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1734.0, + "completions/mean_length": 1702.03125, + "completions/mean_terminated_length": 993.6190795898438, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.001142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2544386684894562, + "learning_rate": 0.0, + "loss": -0.0, + "num_tokens": 118418.0, + "reward": 0.17899775505065918, + "reward_std": 0.7650213241577148, + "rewards/cosine_scaled_reward/mean": -0.09800112992525101, + "rewards/cosine_scaled_reward/std": 0.37953105568885803, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.48795005679130554, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1894.0, + "completions/mean_length": 1738.90625, + "completions/mean_terminated_length": 949.0, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.002285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2436082512140274, + "learning_rate": 5e-08, + "loss": -0.0, + "num_tokens": 239748.0, + "reward": 0.3848632574081421, + "reward_std": 0.9111153483390808, + "rewards/cosine_scaled_reward/mean": 0.020556632429361343, + "rewards/cosine_scaled_reward/std": 0.4492928683757782, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1545.0, + "completions/mean_length": 1989.015625, + "completions/mean_terminated_length": 1104.25, + "completions/min_length": 
706.0, + "completions/min_terminated_length": 706.0, + "epoch": 0.0034285714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2544717788696289, + "learning_rate": 1e-07, + "loss": -0.0, + "num_tokens": 377517.0, + "reward": -0.3279358148574829, + "reward_std": 0.33216947317123413, + "rewards/cosine_scaled_reward/mean": -0.20303040742874146, + "rewards/cosine_scaled_reward/std": 0.179075226187706, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.27048972249031067, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1566.421875, + "completions/mean_terminated_length": 1084.84375, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "epoch": 0.004571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28807103633880615, + "learning_rate": 1.5e-07, + "loss": -0.0, + "num_tokens": 487576.0, + "reward": 0.2716121971607208, + "reward_std": 0.6643469333648682, + "rewards/cosine_scaled_reward/mean": -0.12981891632080078, + "rewards/cosine_scaled_reward/std": 0.3019586503505707, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.5029674172401428, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1807.0, + "completions/mean_length": 1936.84375, + "completions/mean_terminated_length": 1031.71435546875, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "epoch": 0.005714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26783761382102966, + 
"learning_rate": 2e-07, + "loss": -0.0, + "num_tokens": 622350.0, + "reward": -0.3612896800041199, + "reward_std": 0.41048353910446167, + "rewards/cosine_scaled_reward/mean": -0.23533234000205994, + "rewards/cosine_scaled_reward/std": 0.20467400550842285, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.3145764470100403, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1301.0, + "completions/mean_length": 1889.453125, + "completions/mean_terminated_length": 779.625, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "epoch": 0.006857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.262518972158432, + "learning_rate": 2.5e-07, + "loss": 0.0, + "num_tokens": 754923.0, + "reward": -0.29250282049179077, + "reward_std": 0.5422531962394714, + "rewards/cosine_scaled_reward/mean": -0.22437641024589539, + "rewards/cosine_scaled_reward/std": 0.22509199380874634, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36596253514289856, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1568.0, + "completions/mean_length": 1921.921875, + "completions/mean_terminated_length": 1314.45458984375, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "epoch": 0.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22601397335529327, + "learning_rate": 3e-07, + "loss": 0.0, + "num_tokens": 888334.0, + "reward": 0.025340259075164795, + "reward_std": 0.7285393476486206, + 
"rewards/cosine_scaled_reward/mean": -0.1279548704624176, + "rewards/cosine_scaled_reward/std": 0.40222346782684326, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1736.859375, + "completions/mean_terminated_length": 999.9473876953125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.009142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24552854895591736, + "learning_rate": 3.5e-07, + "loss": 0.0, + "num_tokens": 1009909.0, + "reward": 0.21729671955108643, + "reward_std": 0.6989120244979858, + "rewards/cosine_scaled_reward/mean": -0.055414143949747086, + "rewards/cosine_scaled_reward/std": 0.47493892908096313, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.4732423722743988, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 1967.53125, + "completions/mean_terminated_length": 1475.77783203125, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "epoch": 0.010285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2430322915315628, + "learning_rate": 4e-07, + "loss": 0.0, + "num_tokens": 1147287.0, + "reward": -0.21451422572135925, + "reward_std": 0.587526798248291, + "rewards/cosine_scaled_reward/mean": -0.19319462776184082, + "rewards/cosine_scaled_reward/std": 0.29357606172561646, + 
"rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.38025420904159546, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1708.546875, + "completions/mean_terminated_length": 961.75, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.011428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2543582320213318, + "learning_rate": 4.5e-07, + "loss": 0.0, + "num_tokens": 1267466.0, + "reward": 0.02539752423763275, + "reward_std": 0.545810341835022, + "rewards/cosine_scaled_reward/mean": -0.14355123043060303, + "rewards/cosine_scaled_reward/std": 0.36147356033325195, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.467176616191864, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1579.0, + "completions/mean_length": 1967.734375, + "completions/mean_terminated_length": 1191.8333740234375, + "completions/min_length": 843.0, + "completions/min_terminated_length": 843.0, + "epoch": 0.012571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24583907425403595, + "learning_rate": 5e-07, + "loss": -0.0, + "num_tokens": 1405073.0, + "reward": -0.46971434354782104, + "reward_std": 0.36104393005371094, + "rewards/cosine_scaled_reward/mean": -0.28173214197158813, + "rewards/cosine_scaled_reward/std": 0.17775526642799377, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29378482699394226, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 1707.5625, + "completions/mean_terminated_length": 1176.47998046875, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.013714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3135142922401428, + "learning_rate": 5.5e-07, + "loss": -0.0, + "num_tokens": 1525301.0, + "reward": 0.0018395520746707916, + "reward_std": 0.7012988328933716, + "rewards/cosine_scaled_reward/mean": -0.21783021092414856, + "rewards/cosine_scaled_reward/std": 0.324150949716568, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.5, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1745.0, + "completions/mean_length": 1841.96875, + "completions/mean_terminated_length": 1168.933349609375, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.014857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2532394826412201, + "learning_rate": 6e-07, + "loss": -0.0, + "num_tokens": 1654227.0, + "reward": -0.10322706401348114, + "reward_std": 0.6915165185928345, + "rewards/cosine_scaled_reward/mean": -0.17661353945732117, + "rewards/cosine_scaled_reward/std": 0.329875111579895, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + 
"completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1816.390625, + "completions/mean_terminated_length": 1306.8499755859375, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "epoch": 0.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28405147790908813, + "learning_rate": 6.5e-07, + "loss": 0.0, + "num_tokens": 1781084.0, + "reward": 0.10602855682373047, + "reward_std": 0.630502462387085, + "rewards/cosine_scaled_reward/mean": -0.11104822158813477, + "rewards/cosine_scaled_reward/std": 0.3846627473831177, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.4732423722743988, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1574.0, + "completions/mean_length": 1702.109375, + "completions/mean_terminated_length": 818.1666870117188, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.017142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28779250383377075, + "learning_rate": 7e-07, + "loss": 0.0, + "num_tokens": 1900939.0, + "reward": 0.32734519243240356, + "reward_std": 0.3870265483856201, + "rewards/cosine_scaled_reward/mean": 0.007422588765621185, + "rewards/cosine_scaled_reward/std": 0.45787373185157776, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.467176616191864, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, 
+ "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.018285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2337152510881424, + "learning_rate": 7.5e-07, + "loss": -0.0, + "num_tokens": 2042451.0, + "reward": -0.5429925918579102, + "reward_std": 0.3153150975704193, + "rewards/cosine_scaled_reward/mean": -0.2714962661266327, + "rewards/cosine_scaled_reward/std": 0.1678173691034317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 1564.921875, + "completions/mean_terminated_length": 858.8846435546875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.019428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33599403500556946, + "learning_rate": 8e-07, + "loss": -0.0, + "num_tokens": 2153126.0, + "reward": 0.17696775496006012, + "reward_std": 0.6489306688308716, + "rewards/cosine_scaled_reward/mean": -0.11464111506938934, + "rewards/cosine_scaled_reward/std": 0.3551919758319855, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49501484632492065, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 1795.390625, + "completions/mean_terminated_length": 893.21435546875, + "completions/min_length": 619.0, + "completions/min_terminated_length": 619.0, + "epoch": 0.02057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 
0.22697053849697113, + "learning_rate": 8.499999999999999e-07, + "loss": -0.0, + "num_tokens": 2278407.0, + "reward": -0.10711958259344101, + "reward_std": 0.5238703489303589, + "rewards/cosine_scaled_reward/mean": -0.1785597801208496, + "rewards/cosine_scaled_reward/std": 0.2545098662376404, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 1921.484375, + "completions/mean_terminated_length": 1238.300048828125, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "epoch": 0.021714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23972108960151672, + "learning_rate": 9e-07, + "loss": 0.0, + "num_tokens": 2412638.0, + "reward": 0.029344379901885986, + "reward_std": 0.6719281077384949, + "rewards/cosine_scaled_reward/mean": -0.086890310049057, + "rewards/cosine_scaled_reward/std": 0.40220555663108826, + "rewards/format_reward/mean": 0.203125, + "rewards/format_reward/std": 0.40550529956817627, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1728.5625, + "completions/mean_terminated_length": 845.4117431640625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.022857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23309311270713806, + "learning_rate": 9.499999999999999e-07, + "loss": 0.0, + "num_tokens": 2534618.0, + "reward": 
0.0131673663854599, + "reward_std": 0.4436222314834595, + "rewards/cosine_scaled_reward/mean": -0.13404130935668945, + "rewards/cosine_scaled_reward/std": 0.32819250226020813, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 1777.953125, + "completions/mean_terminated_length": 1087.8333740234375, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29990270733833313, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 2659215.0, + "reward": -0.1764472872018814, + "reward_std": 0.5121938586235046, + "rewards/cosine_scaled_reward/mean": -0.2444736361503601, + "rewards/cosine_scaled_reward/std": 0.289971262216568, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.467176616191864, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1361.28125, + "completions/mean_terminated_length": 921.0769653320312, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.025142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29922786355018616, + "learning_rate": 9.99931462820376e-07, + "loss": -0.0, + "num_tokens": 2755353.0, + "reward": 0.6089149713516235, + "reward_std": 0.5986809730529785, + "rewards/cosine_scaled_reward/mean": -0.05491749942302704, + 
"rewards/cosine_scaled_reward/std": 0.39076483249664307, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1565.046875, + "completions/mean_terminated_length": 903.2222290039062, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.026285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27512773871421814, + "learning_rate": 9.997258721585931e-07, + "loss": -0.0, + "num_tokens": 2866308.0, + "reward": 0.21871733665466309, + "reward_std": 0.5976030826568604, + "rewards/cosine_scaled_reward/mean": -0.10157884657382965, + "rewards/cosine_scaled_reward/std": 0.3856185972690582, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.49776285886764526, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 1801.671875, + "completions/mean_terminated_length": 1259.75, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "epoch": 0.027428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22642865777015686, + "learning_rate": 9.993832906395582e-07, + "loss": -0.0, + "num_tokens": 2992543.0, + "reward": 0.04899948835372925, + "reward_std": 0.8525694608688354, + "rewards/cosine_scaled_reward/mean": -0.17081275582313538, + "rewards/cosine_scaled_reward/std": 0.3993513882160187, + "rewards/format_reward/mean": 0.390625, + 
"rewards/format_reward/std": 0.4917473793029785, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1715.765625, + "completions/mean_terminated_length": 1035.4761962890625, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.02857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25316134095191956, + "learning_rate": 9.989038226169207e-07, + "loss": -0.0, + "num_tokens": 3112648.0, + "reward": 0.10585837811231613, + "reward_std": 0.7828943729400635, + "rewards/cosine_scaled_reward/mean": -0.11894579976797104, + "rewards/cosine_scaled_reward/std": 0.4141720235347748, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 1917.703125, + "completions/mean_terminated_length": 1452.357177734375, + "completions/min_length": 840.0, + "completions/min_terminated_length": 840.0, + "epoch": 0.029714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2521306574344635, + "learning_rate": 9.982876141412855e-07, + "loss": -0.0, + "num_tokens": 3246013.0, + "reward": 0.17620250582695007, + "reward_std": 0.6548349857330322, + "rewards/cosine_scaled_reward/mean": -0.08377375453710556, + "rewards/cosine_scaled_reward/std": 0.3527655303478241, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1851.015625, + "completions/mean_terminated_length": 1147.5, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "epoch": 0.030857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2730060815811157, + "learning_rate": 9.975348529157229e-07, + "loss": -0.0, + "num_tokens": 3374766.0, + "reward": -0.18854813277721405, + "reward_std": 0.49348777532577515, + "rewards/cosine_scaled_reward/mean": -0.21146157383918762, + "rewards/cosine_scaled_reward/std": 0.2601618766784668, + "rewards/format_reward/mean": 0.234375, + "rewards/format_reward/std": 0.42695629596710205, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1798.328125, + "completions/mean_terminated_length": 1049.3125, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2566036880016327, + "learning_rate": 9.96645768238595e-07, + "loss": 0.0, + "num_tokens": 3500195.0, + "reward": 0.06705980002880096, + "reward_std": 0.7090284824371338, + "rewards/cosine_scaled_reward/mean": -0.10709509253501892, + "rewards/cosine_scaled_reward/std": 0.4101051986217499, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1930.203125, + "completions/mean_terminated_length": 1210.3333740234375, + "completions/min_length": 582.0, + "completions/min_terminated_length": 582.0, + "epoch": 0.03314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25197461247444153, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0, + "num_tokens": 3634200.0, + "reward": -0.2462695688009262, + "reward_std": 0.5237302780151367, + "rewards/cosine_scaled_reward/mean": -0.2012597918510437, + "rewards/cosine_scaled_reward/std": 0.23252712190151215, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36596253514289856, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1900.0, + "completions/mean_length": 1847.65625, + "completions/mean_terminated_length": 1061.6923828125, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "epoch": 0.03428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30431485176086426, + "learning_rate": 9.944597532678119e-07, + "loss": 0.0, + "num_tokens": 3762986.0, + "reward": -0.05392302945256233, + "reward_std": 0.7249555587768555, + "rewards/cosine_scaled_reward/mean": -0.15196150541305542, + "rewards/cosine_scaled_reward/std": 0.34566983580589294, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 
1860.0, + "completions/mean_length": 1838.671875, + "completions/mean_terminated_length": 931.5833740234375, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.03542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2484513372182846, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0, + "num_tokens": 3891157.0, + "reward": -0.11271396279335022, + "reward_std": 0.6705260872840881, + "rewards/cosine_scaled_reward/mean": -0.1813569962978363, + "rewards/cosine_scaled_reward/std": 0.4071698486804962, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1715.0, + "completions/mean_length": 1910.109375, + "completions/mean_terminated_length": 1417.6429443359375, + "completions/min_length": 906.0, + "completions/min_terminated_length": 906.0, + "epoch": 0.036571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25329527258872986, + "learning_rate": 9.917322325514487e-07, + "loss": -0.0, + "num_tokens": 4023756.0, + "reward": -0.08931556344032288, + "reward_std": 0.6381070613861084, + "rewards/cosine_scaled_reward/mean": -0.16965776681900024, + "rewards/cosine_scaled_reward/std": 0.37385129928588867, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1865.0, + "completions/mean_length": 2023.71875, + "completions/mean_terminated_length": 1530.0, + 
"completions/min_length": 1107.0, + "completions/min_terminated_length": 1107.0, + "epoch": 0.037714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22758109867572784, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0, + "num_tokens": 4164490.0, + "reward": -0.4589868187904358, + "reward_std": 0.5177067518234253, + "rewards/cosine_scaled_reward/mean": -0.2919934093952179, + "rewards/cosine_scaled_reward/std": 0.2252870500087738, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3333333432674408, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1454.78125, + "completions/mean_terminated_length": 963.2571411132812, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.038857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3234354257583618, + "learning_rate": 9.88466529153356e-07, + "loss": 0.0, + "num_tokens": 4267148.0, + "reward": 0.656031608581543, + "reward_std": 0.7529654502868652, + "rewards/cosine_scaled_reward/mean": 0.05457830801606178, + "rewards/cosine_scaled_reward/std": 0.49684229493141174, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.501733124256134, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1724.0, + "completions/mean_length": 1819.078125, + "completions/mean_terminated_length": 716.0909423828125, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "epoch": 0.04, + 
"frac_reward_zero_std": 0.0, + "grad_norm": 0.2821458876132965, + "learning_rate": 9.866330768241983e-07, + "loss": -0.0, + "num_tokens": 4395065.0, + "reward": -0.09630556404590607, + "reward_std": 0.7089139223098755, + "rewards/cosine_scaled_reward/mean": -0.15752778947353363, + "rewards/cosine_scaled_reward/std": 0.3647947609424591, + "rewards/format_reward/mean": 0.21875, + "rewards/format_reward/std": 0.4166666865348816, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1811.0, + "completions/mean_length": 1954.34375, + "completions/mean_terminated_length": 1382.0, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "epoch": 0.04114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24163897335529327, + "learning_rate": 9.846666218300807e-07, + "loss": -0.0, + "num_tokens": 4531255.0, + "reward": -0.34593287110328674, + "reward_std": 0.44493502378463745, + "rewards/cosine_scaled_reward/mean": -0.24327893555164337, + "rewards/cosine_scaled_reward/std": 0.24784433841705322, + "rewards/format_reward/mean": 0.140625, + "rewards/format_reward/std": 0.3503824472427368, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1723.0, + "completions/mean_length": 1868.921875, + "completions/mean_terminated_length": 1092.916748046875, + "completions/min_length": 620.0, + "completions/min_terminated_length": 620.0, + "epoch": 0.04228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24795544147491455, + "learning_rate": 9.825677631722435e-07, + "loss": 
-0.0, + "num_tokens": 4661890.0, + "reward": -0.23053905367851257, + "reward_std": 0.34036368131637573, + "rewards/cosine_scaled_reward/mean": -0.2246445268392563, + "rewards/cosine_scaled_reward/std": 0.15942412614822388, + "rewards/format_reward/mean": 0.21875, + "rewards/format_reward/std": 0.4166666865348816, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 1889.53125, + "completions/mean_terminated_length": 1033.800048828125, + "completions/min_length": 810.0, + "completions/min_terminated_length": 810.0, + "epoch": 0.04342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24283826351165771, + "learning_rate": 9.80337140183366e-07, + "loss": 0.0, + "num_tokens": 4794532.0, + "reward": -0.10043507814407349, + "reward_std": 0.47925832867622375, + "rewards/cosine_scaled_reward/mean": -0.13615503907203674, + "rewards/cosine_scaled_reward/std": 0.3336707651615143, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.38025420904159546, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1515.0, + "completions/mean_length": 1644.828125, + "completions/mean_terminated_length": 689.9473876953125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.044571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28362998366355896, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0, + "num_tokens": 4910585.0, + "reward": 0.12284853309392929, + "reward_std": 0.4183085858821869, + 
"rewards/cosine_scaled_reward/mean": -0.11045074462890625, + "rewards/cosine_scaled_reward/std": 0.30217844247817993, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4787135720252991, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1632.0, + "completions/mean_length": 1618.28125, + "completions/mean_terminated_length": 902.0833740234375, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.045714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.262617826461792, + "learning_rate": 9.754833590196926e-07, + "loss": 0.0, + "num_tokens": 5024227.0, + "reward": 0.2076582908630371, + "reward_std": 0.42125773429870605, + "rewards/cosine_scaled_reward/mean": -0.12273336946964264, + "rewards/cosine_scaled_reward/std": 0.4404613971710205, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1914.0, + "completions/mean_length": 1717.734375, + "completions/mean_terminated_length": 1235.0384521484375, + "completions/min_length": 664.0, + "completions/min_terminated_length": 664.0, + "epoch": 0.046857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23294499516487122, + "learning_rate": 9.728616793536587e-07, + "loss": -0.0, + "num_tokens": 5145314.0, + "reward": 0.011502981185913086, + "reward_std": 0.6816084980964661, + "rewards/cosine_scaled_reward/mean": -0.22081100940704346, + "rewards/cosine_scaled_reward/std": 
0.37589573860168457, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1672.0, + "completions/mean_length": 1703.921875, + "completions/mean_terminated_length": 579.933349609375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34672290086746216, + "learning_rate": 9.701111919237408e-07, + "loss": -0.0, + "num_tokens": 5264725.0, + "reward": -0.2616002857685089, + "reward_std": 0.37952175736427307, + "rewards/cosine_scaled_reward/mean": -0.26361262798309326, + "rewards/cosine_scaled_reward/std": 0.17531204223632812, + "rewards/format_reward/mean": 0.265625, + "rewards/format_reward/std": 0.44515693187713623, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1370.0, + "completions/mean_length": 1681.84375, + "completions/mean_terminated_length": 814.631591796875, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.04914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.263967901468277, + "learning_rate": 9.672327345550543e-07, + "loss": -0.0, + "num_tokens": 5383979.0, + "reward": 0.13376155495643616, + "reward_std": 0.46012288331985474, + "rewards/cosine_scaled_reward/mean": -0.08155670762062073, + "rewards/cosine_scaled_reward/std": 0.3612325191497803, + "rewards/format_reward/mean": 0.296875, + "rewards/format_reward/std": 0.4604927599430084, + "step": 
43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 1624.625, + "completions/mean_terminated_length": 869.9130859375, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.05028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28927963972091675, + "learning_rate": 9.64227184053598e-07, + "loss": -0.0, + "num_tokens": 5498651.0, + "reward": 0.20869271457195282, + "reward_std": 0.5558150410652161, + "rewards/cosine_scaled_reward/mean": -0.0987786278128624, + "rewards/cosine_scaled_reward/std": 0.42912590503692627, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49501484632492065, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1851.0, + "completions/mean_length": 2006.96875, + "completions/mean_terminated_length": 1522.800048828125, + "completions/min_length": 955.0, + "completions/min_terminated_length": 955.0, + "epoch": 0.05142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24254000186920166, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0, + "num_tokens": 5638753.0, + "reward": -0.2540697157382965, + "reward_std": 0.4600578844547272, + "rewards/cosine_scaled_reward/mean": -0.20515984296798706, + "rewards/cosine_scaled_reward/std": 0.3251590430736542, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36596253514289856, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1563.0, + "completions/mean_length": 1765.984375, + "completions/mean_terminated_length": 919.9375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "epoch": 0.052571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2645930349826813, + "learning_rate": 9.578385041664925e-07, + "loss": 0.0, + "num_tokens": 5762944.0, + "reward": -0.213707834482193, + "reward_std": 0.38778313994407654, + "rewards/cosine_scaled_reward/mean": -0.2318539321422577, + "rewards/cosine_scaled_reward/std": 0.21436986327171326, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4364357888698578, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1583.40625, + "completions/mean_terminated_length": 986.0714721679688, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.053714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.311797559261322, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0, + "num_tokens": 5874682.0, + "reward": 0.27925533056259155, + "reward_std": 0.6467443704605103, + "rewards/cosine_scaled_reward/mean": -0.07912233471870422, + "rewards/cosine_scaled_reward/std": 0.4737093150615692, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.5, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1527.0, + "completions/mean_length": 1690.0625, + "completions/mean_terminated_length": 1006.727294921875, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.054857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26644304394721985, + "learning_rate": 9.509529358847654e-07, + "loss": -0.0, + "num_tokens": 5993390.0, + "reward": 0.13692031800746918, + "reward_std": 0.5655145049095154, + "rewards/cosine_scaled_reward/mean": -0.12685233354568481, + "rewards/cosine_scaled_reward/std": 0.32320985198020935, + "rewards/format_reward/mean": 0.390625, + "rewards/format_reward/std": 0.4917473793029785, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1387.140625, + "completions/mean_terminated_length": 804.0294189453125, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3078882396221161, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0, + "num_tokens": 6092231.0, + "reward": 0.35559189319610596, + "reward_std": 0.5927403569221497, + "rewards/cosine_scaled_reward/mean": -0.09564155340194702, + "rewards/cosine_scaled_reward/std": 0.4046906530857086, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.501733124256134, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 1674.890625, + "completions/mean_terminated_length": 
962.5909423828125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.05714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23925544321537018, + "learning_rate": 9.43578868212728e-07, + "loss": -0.0, + "num_tokens": 6210240.0, + "reward": 0.18573230504989624, + "reward_std": 0.5264967083930969, + "rewards/cosine_scaled_reward/mean": -0.09463384002447128, + "rewards/cosine_scaled_reward/std": 0.4100942015647888, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.48795005679130554, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1347.40625, + "completions/mean_terminated_length": 836.1621704101562, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.05828571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.325811505317688, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0, + "num_tokens": 6306682.0, + "reward": 0.1735648661851883, + "reward_std": 0.5335988998413086, + "rewards/cosine_scaled_reward/mean": -0.21009255945682526, + "rewards/cosine_scaled_reward/std": 0.2623959481716156, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49501484632492065, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1390.0, + "completions/mean_length": 1727.765625, + "completions/mean_terminated_length": 767.0625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 
0.05942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27392977476119995, + "learning_rate": 9.357252853159505e-07, + "loss": 0.0, + "num_tokens": 6428611.0, + "reward": -0.16267812252044678, + "reward_std": 0.5682471990585327, + "rewards/cosine_scaled_reward/mean": -0.2219640612602234, + "rewards/cosine_scaled_reward/std": 0.36739134788513184, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4531635046005249, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1907.0, + "completions/mean_length": 1609.171875, + "completions/mean_terminated_length": 924.5999755859375, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.060571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28155064582824707, + "learning_rate": 9.316216432703916e-07, + "loss": -0.0, + "num_tokens": 6542430.0, + "reward": 0.0752667784690857, + "reward_std": 0.7118167281150818, + "rewards/cosine_scaled_reward/mean": -0.18892911076545715, + "rewards/cosine_scaled_reward/std": 0.3222156763076782, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1588.234375, + "completions/mean_terminated_length": 1067.166748046875, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "epoch": 0.061714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2555343806743622, + "learning_rate": 
9.274017555754407e-07, + "loss": 0.0, + "num_tokens": 6655221.0, + "reward": 0.6341299414634705, + "reward_std": 1.0656921863555908, + "rewards/cosine_scaled_reward/mean": 0.05143994837999344, + "rewards/cosine_scaled_reward/std": 0.5348308086395264, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.5029674172401428, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1420.0, + "completions/mean_length": 1549.5625, + "completions/mean_terminated_length": 821.0769653320312, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "epoch": 0.06285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30243629217147827, + "learning_rate": 9.230669076497687e-07, + "loss": -0.0, + "num_tokens": 6764681.0, + "reward": 0.13021975755691528, + "reward_std": 0.3984764516353607, + "rewards/cosine_scaled_reward/mean": -0.13801513612270355, + "rewards/cosine_scaled_reward/std": 0.41228073835372925, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49501484632492065, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1633.25, + "completions/mean_terminated_length": 1132.689697265625, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "epoch": 0.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23835402727127075, + "learning_rate": 9.186184199300463e-07, + "loss": -0.0, + "num_tokens": 6880169.0, + "reward": 0.27981996536254883, + "reward_std": 
0.5018116235733032, + "rewards/cosine_scaled_reward/mean": -0.10227750986814499, + "rewards/cosine_scaled_reward/std": 0.481824666261673, + "rewards/format_reward/mean": 0.484375, + "rewards/format_reward/std": 0.5037065148353577, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1807.0, + "completions/mean_length": 1699.875, + "completions/mean_terminated_length": 1156.7999267578125, + "completions/min_length": 642.0, + "completions/min_terminated_length": 642.0, + "epoch": 0.06514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22349494695663452, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0, + "num_tokens": 7000529.0, + "reward": -0.026505012065172195, + "reward_std": 0.5785415172576904, + "rewards/cosine_scaled_reward/mean": -0.20856501162052155, + "rewards/cosine_scaled_reward/std": 0.2749907374382019, + "rewards/format_reward/mean": 0.390625, + "rewards/format_reward/std": 0.4917473793029785, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1457.875, + "completions/mean_terminated_length": 1054.105224609375, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "epoch": 0.06628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.261942595243454, + "learning_rate": 9.093859795212817e-07, + "loss": 0.0, + "num_tokens": 7103929.0, + "reward": 0.5745843648910522, + "reward_std": 0.8671218156814575, + "rewards/cosine_scaled_reward/mean": -0.03302033245563507, + 
"rewards/cosine_scaled_reward/std": 0.45529407262802124, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4836103618144989, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1590.0625, + "completions/mean_terminated_length": 1159.8787841796875, + "completions/min_length": 591.0, + "completions/min_terminated_length": 591.0, + "epoch": 0.06742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24828943610191345, + "learning_rate": 9.046048391230247e-07, + "loss": -0.0, + "num_tokens": 7216157.0, + "reward": 0.3377103805541992, + "reward_std": 0.5543617010116577, + "rewards/cosine_scaled_reward/mean": -0.1045822948217392, + "rewards/cosine_scaled_reward/std": 0.39040952920913696, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.501733124256134, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 1622.84375, + "completions/mean_terminated_length": 1076.21435546875, + "completions/min_length": 555.0, + "completions/min_terminated_length": 555.0, + "epoch": 0.06857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2752656936645508, + "learning_rate": 8.997156826556369e-07, + "loss": -0.0, + "num_tokens": 7330907.0, + "reward": 0.11114693433046341, + "reward_std": 0.6926254034042358, + "rewards/cosine_scaled_reward/mean": -0.1788015365600586, + "rewards/cosine_scaled_reward/std": 0.39409172534942627, + "rewards/format_reward/mean": 0.46875, + 
"rewards/format_reward/std": 0.5029674172401428, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1708.859375, + "completions/mean_terminated_length": 1014.4285888671875, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.06971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22669929265975952, + "learning_rate": 8.9471999940354e-07, + "loss": -0.0, + "num_tokens": 7451794.0, + "reward": 0.2345120906829834, + "reward_std": 0.6293160319328308, + "rewards/cosine_scaled_reward/mean": -0.1093064472079277, + "rewards/cosine_scaled_reward/std": 0.29189831018447876, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.501733124256134, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1281.53125, + "completions/mean_terminated_length": 1004.2978515625, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.07085714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25438693165779114, + "learning_rate": 8.896193111002475e-07, + "loss": 0.0, + "num_tokens": 7544044.0, + "reward": 0.9180847406387329, + "reward_std": 0.6390912532806396, + "rewards/cosine_scaled_reward/mean": 0.06841734796762466, + "rewards/cosine_scaled_reward/std": 0.48315128684043884, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4166666865348816, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 1310.46875, + "completions/mean_terminated_length": 896.731689453125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28795576095581055, + "learning_rate": 8.844151714648274e-07, + "loss": -0.0, + "num_tokens": 7638170.0, + "reward": 0.6488770246505737, + "reward_std": 0.7876260876655579, + "rewards/cosine_scaled_reward/mean": -0.019311510026454926, + "rewards/cosine_scaled_reward/std": 0.4736698865890503, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.467176616191864, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 1307.625, + "completions/mean_terminated_length": 1039.8297119140625, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.07314285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25637197494506836, + "learning_rate": 8.791091657286267e-07, + "loss": -0.0, + "num_tokens": 7732810.0, + "reward": 0.8280279636383057, + "reward_std": 0.6804471015930176, + "rewards/cosine_scaled_reward/mean": 0.015576483681797981, + "rewards/cosine_scaled_reward/std": 0.44819310307502747, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.40550529956817627, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1846.0, + "completions/mean_length": 1322.125, + "completions/mean_terminated_length": 914.9268188476562, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.07428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2944399118423462, + "learning_rate": 8.737029101523929e-07, + "loss": -0.0, + "num_tokens": 7828130.0, + "reward": 0.15610456466674805, + "reward_std": 0.4606686234474182, + "rewards/cosine_scaled_reward/mean": -0.24226020276546478, + "rewards/cosine_scaled_reward/std": 0.33131492137908936, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4836103618144989, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 1020.21875, + "completions/mean_terminated_length": 806.9057006835938, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.07542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32644009590148926, + "learning_rate": 8.681980515339463e-07, + "loss": 0.0, + "num_tokens": 7903656.0, + "reward": 0.7972471714019775, + "reward_std": 0.7674820423126221, + "rewards/cosine_scaled_reward/mean": -0.031063925474882126, + "rewards/cosine_scaled_reward/std": 0.5106223225593567, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1910.0, + "completions/mean_length": 1750.859375, + "completions/mean_terminated_length": 1142.4285888671875, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "epoch": 0.07657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2270829975605011, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0, + "num_tokens": 8026447.0, + "reward": -0.1400720775127411, + "reward_std": 0.3325888514518738, + "rewards/cosine_scaled_reward/mean": -0.24972353875637054, + "rewards/cosine_scaled_reward/std": 0.16404789686203003, + "rewards/format_reward/mean": 0.359375, + "rewards/format_reward/std": 0.4836103618144989, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1424.0, + "completions/mean_length": 769.546875, + "completions/mean_terminated_length": 637.2930908203125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.07771428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37025144696235657, + "learning_rate": 8.568992620281243e-07, + "loss": -0.0, + "num_tokens": 8084954.0, + "reward": 0.9792699813842773, + "reward_std": 0.804767370223999, + "rewards/cosine_scaled_reward/mean": 0.03651002421975136, + "rewards/cosine_scaled_reward/std": 0.46041443943977356, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1701.0, + "completions/mean_length": 1086.234375, + 
"completions/mean_terminated_length": 886.6226806640625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.07885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3763800263404846, + "learning_rate": 8.511087728614862e-07, + "loss": 0.0, + "num_tokens": 8164817.0, + "reward": 0.35803771018981934, + "reward_std": 0.5702667236328125, + "rewards/cosine_scaled_reward/mean": -0.24285613000392914, + "rewards/cosine_scaled_reward/std": 0.3019825220108032, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1898.0, + "completions/mean_length": 1463.375, + "completions/mean_terminated_length": 1112.5999755859375, + "completions/min_length": 503.0, + "completions/min_terminated_length": 503.0, + "epoch": 0.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24232418835163116, + "learning_rate": 8.452265630457282e-07, + "loss": -0.0, + "num_tokens": 8269929.0, + "reward": 0.3703588843345642, + "reward_std": 0.7288752794265747, + "rewards/cosine_scaled_reward/mean": -0.1351330280303955, + "rewards/cosine_scaled_reward/std": 0.3751916289329529, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4836103618144989, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1409.859375, + "completions/mean_terminated_length": 973.2368774414062, + "completions/min_length": 404.0, + "completions/min_terminated_length": 
404.0, + "epoch": 0.08114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.300010621547699, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0, + "num_tokens": 8370880.0, + "reward": 0.5196826457977295, + "reward_std": 0.7097917795181274, + "rewards/cosine_scaled_reward/mean": -0.044846177101135254, + "rewards/cosine_scaled_reward/std": 0.508389949798584, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4917473793029785, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 1228.046875, + "completions/mean_terminated_length": 931.4680786132812, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.08228571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30454304814338684, + "learning_rate": 8.331941759724268e-07, + "loss": -0.0, + "num_tokens": 8459827.0, + "reward": 0.41365131735801697, + "reward_std": 0.5005639791488647, + "rewards/cosine_scaled_reward/mean": -0.1759868562221527, + "rewards/cosine_scaled_reward/std": 0.19868774712085724, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1719.0, + "completions/mean_length": 1513.28125, + "completions/mean_terminated_length": 1192.4500732421875, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "epoch": 0.08342857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27848970890045166, + 
"learning_rate": 8.270476638965461e-07, + "loss": -0.0, + "num_tokens": 8567405.0, + "reward": 0.09570223093032837, + "reward_std": 0.5445049405097961, + "rewards/cosine_scaled_reward/mean": -0.2802739143371582, + "rewards/cosine_scaled_reward/std": 0.25603488087654114, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4787135720252991, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1888.0, + "completions/mean_length": 1240.125, + "completions/mean_terminated_length": 924.0, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "epoch": 0.08457142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2789021134376526, + "learning_rate": 8.208167604184217e-07, + "loss": 0.0, + "num_tokens": 8656701.0, + "reward": 0.7823752760887146, + "reward_std": 0.6479132175445557, + "rewards/cosine_scaled_reward/mean": 0.031812600791454315, + "rewards/cosine_scaled_reward/std": 0.5397623181343079, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1455.953125, + "completions/mean_terminated_length": 1186.8409423828125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "epoch": 0.08571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22443196177482605, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0, + "num_tokens": 8760842.0, + "reward": 0.8040015697479248, + "reward_std": 
0.5675323009490967, + "rewards/cosine_scaled_reward/mean": 0.027000809088349342, + "rewards/cosine_scaled_reward/std": 0.5096040964126587, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4364357888698578, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1720.0, + "completions/mean_length": 1177.859375, + "completions/mean_terminated_length": 863.1276245117188, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.08685714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32647648453712463, + "learning_rate": 8.081093963579707e-07, + "loss": 0.0, + "num_tokens": 8846625.0, + "reward": 0.310506671667099, + "reward_std": 0.5110941529273987, + "rewards/cosine_scaled_reward/mean": -0.2119341641664505, + "rewards/cosine_scaled_reward/std": 0.24737994372844696, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44515693187713623, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1787.0, + "completions/mean_length": 1263.4375, + "completions/mean_terminated_length": 1043.760009765625, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2545543611049652, + "learning_rate": 8.01636806561836e-07, + "loss": -0.0, + "num_tokens": 8939061.0, + "reward": 0.5484907031059265, + "reward_std": 0.48998576402664185, + "rewards/cosine_scaled_reward/mean": -0.13200464844703674, + "rewards/cosine_scaled_reward/std": 
0.3430649936199188, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39339789748191833, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1460.78125, + "completions/mean_terminated_length": 1059.0, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.08914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2583931088447571, + "learning_rate": 7.950875657567621e-07, + "loss": 0.0, + "num_tokens": 9043271.0, + "reward": 0.6075442433357239, + "reward_std": 0.6895643472671509, + "rewards/cosine_scaled_reward/mean": -0.0009153857827186584, + "rewards/cosine_scaled_reward/std": 0.48922818899154663, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4917473793029785, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1956.0, + "completions/mean_length": 1054.875, + "completions/mean_terminated_length": 892.3635864257812, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.09028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29089078307151794, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0, + "num_tokens": 9120879.0, + "reward": 0.6885831356048584, + "reward_std": 0.508629322052002, + "rewards/cosine_scaled_reward/mean": -0.09320840239524841, + "rewards/cosine_scaled_reward/std": 0.38835227489471436, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 79 + 
}, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1399.046875, + "completions/mean_terminated_length": 1145.1087646484375, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "epoch": 0.09142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27458345890045166, + "learning_rate": 7.817671337095244e-07, + "loss": 0.0, + "num_tokens": 9220810.0, + "reward": 0.5549384355545044, + "reward_std": 0.7092134952545166, + "rewards/cosine_scaled_reward/mean": -0.09753081202507019, + "rewards/cosine_scaled_reward/std": 0.4125780463218689, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4364357888698578, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1833.0, + "completions/mean_length": 1084.984375, + "completions/mean_terminated_length": 906.6481323242188, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.09257142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37247684597969055, + "learning_rate": 7.75e-07, + "loss": -0.0, + "num_tokens": 9301521.0, + "reward": 0.5357480049133301, + "reward_std": 0.5661624670028687, + "rewards/cosine_scaled_reward/mean": -0.18525099754333496, + "rewards/cosine_scaled_reward/std": 0.3385297954082489, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1260.921875, + "completions/mean_terminated_length": 998.5625, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.09371428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27329322695732117, + "learning_rate": 7.681643291108517e-07, + "loss": -0.0, + "num_tokens": 9392548.0, + "reward": 0.9478914737701416, + "reward_std": 0.4313860237598419, + "rewards/cosine_scaled_reward/mean": 0.09894578158855438, + "rewards/cosine_scaled_reward/std": 0.5477120876312256, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4364357888698578, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1309.671875, + "completions/mean_terminated_length": 922.9285888671875, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.09485714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3202998638153076, + "learning_rate": 7.612622032536507e-07, + "loss": -0.0, + "num_tokens": 9487455.0, + "reward": 0.5201998949050903, + "reward_std": 0.6858996152877808, + "rewards/cosine_scaled_reward/mean": -0.09927503764629364, + "rewards/cosine_scaled_reward/std": 0.37909674644470215, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1685.0, + "completions/mean_length": 1185.703125, + "completions/mean_terminated_length": 965.9019775390625, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29646041989326477, + "learning_rate": 7.54295724882796e-07, + "loss": -0.0, + "num_tokens": 9574036.0, + "reward": 0.6779025793075562, + "reward_std": 0.557724118232727, + "rewards/cosine_scaled_reward/mean": -0.09073619544506073, + "rewards/cosine_scaled_reward/std": 0.3855368196964264, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1297.828125, + "completions/mean_terminated_length": 1158.907470703125, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "epoch": 0.09714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21307455003261566, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0, + "num_tokens": 9667417.0, + "reward": 0.5093189477920532, + "reward_std": 0.6006681323051453, + "rewards/cosine_scaled_reward/mean": -0.1672155261039734, + "rewards/cosine_scaled_reward/std": 0.34896284341812134, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1859.0, + "completions/mean_length": 1348.90625, + "completions/mean_terminated_length": 
1096.04248046875, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.09828571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2883393168449402, + "learning_rate": 7.401782177833147e-07, + "loss": -0.0, + "num_tokens": 9764603.0, + "reward": 0.8025823831558228, + "reward_std": 0.547119677066803, + "rewards/cosine_scaled_reward/mean": 0.01847870647907257, + "rewards/cosine_scaled_reward/std": 0.4346420168876648, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1782.0, + "completions/mean_length": 1086.96875, + "completions/mean_terminated_length": 909.0, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.09942857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31888866424560547, + "learning_rate": 7.330314893841101e-07, + "loss": -0.0, + "num_tokens": 9844289.0, + "reward": 0.5533354878425598, + "reward_std": 0.5319498777389526, + "rewards/cosine_scaled_reward/mean": -0.1530197560787201, + "rewards/cosine_scaled_reward/std": 0.2434682846069336, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 954.921875, + "completions/mean_terminated_length": 919.6612548828125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 
0.10057142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3025936484336853, + "learning_rate": 7.258290078201731e-07, + "loss": -0.0, + "num_tokens": 9915916.0, + "reward": 1.2692296504974365, + "reward_std": 0.5115163326263428, + "rewards/cosine_scaled_reward/mean": 0.13461479544639587, + "rewards/cosine_scaled_reward/std": 0.506001353263855, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 1351.8125, + "completions/mean_terminated_length": 1174.35302734375, + "completions/min_length": 650.0, + "completions/min_terminated_length": 650.0, + "epoch": 0.10171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23423585295677185, + "learning_rate": 7.185729670371604e-07, + "loss": -0.0, + "num_tokens": 10013432.0, + "reward": 0.724889874458313, + "reward_std": 0.7425336837768555, + "rewards/cosine_scaled_reward/mean": -0.0828675627708435, + "rewards/cosine_scaled_reward/std": 0.3893774449825287, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1906.0, + "completions/mean_length": 1153.28125, + "completions/mean_terminated_length": 1025.46435546875, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "epoch": 0.10285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3860023021697998, + "learning_rate": 7.11265577295385e-07, + "loss": 
-0.0, + "num_tokens": 10097242.0, + "reward": 0.5000253915786743, + "reward_std": 0.5103108286857605, + "rewards/cosine_scaled_reward/mean": -0.18748730421066284, + "rewards/cosine_scaled_reward/std": 0.2787182629108429, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1455.484375, + "completions/mean_terminated_length": 1166.1163330078125, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2551063895225525, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0, + "num_tokens": 10200961.0, + "reward": 0.4053259789943695, + "reward_std": 0.663999617099762, + "rewards/cosine_scaled_reward/mean": -0.18796202540397644, + "rewards/cosine_scaled_reward/std": 0.35777655243873596, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4166666865348816, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1176.953125, + "completions/mean_terminated_length": 1015.6481323242188, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.10514285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27449366450309753, + "learning_rate": 6.965056695057204e-07, + "loss": -0.0, + "num_tokens": 10286278.0, + "reward": 0.5743436217308044, + "reward_std": 0.6229422092437744, + 
"rewards/cosine_scaled_reward/mean": -0.15032817423343658, + "rewards/cosine_scaled_reward/std": 0.2899566888809204, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1434.875, + "completions/mean_terminated_length": 1156.181884765625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.10628571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2839376926422119, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0, + "num_tokens": 10389454.0, + "reward": 0.30658647418022156, + "reward_std": 0.5343226194381714, + "rewards/cosine_scaled_reward/mean": -0.22951926290988922, + "rewards/cosine_scaled_reward/std": 0.2324177473783493, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1684.0, + "completions/mean_length": 1242.390625, + "completions/mean_terminated_length": 927.1522216796875, + "completions/min_length": 508.0, + "completions/min_terminated_length": 508.0, + "epoch": 0.10742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2985072433948517, + "learning_rate": 6.815672671252315e-07, + "loss": 0.0, + "num_tokens": 10478735.0, + "reward": 0.6593698263168335, + "reward_std": 0.5845412015914917, + "rewards/cosine_scaled_reward/mean": -0.02969011664390564, + "rewards/cosine_scaled_reward/std": 0.47056320309638977, 
+ "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4531635046005249, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1931.0, + "completions/mean_length": 1203.265625, + "completions/mean_terminated_length": 1082.58935546875, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "epoch": 0.10857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2689598798751831, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0, + "num_tokens": 10566272.0, + "reward": 0.4301251173019409, + "reward_std": 0.4795047640800476, + "rewards/cosine_scaled_reward/mean": -0.22243742644786835, + "rewards/cosine_scaled_reward/std": 0.2575407326221466, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 1205.5625, + "completions/mean_terminated_length": 990.8235473632812, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.10971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30502915382385254, + "learning_rate": 6.664685702961344e-07, + "loss": -0.0, + "num_tokens": 10654564.0, + "reward": 0.896080493927002, + "reward_std": 0.6987663507461548, + "rewards/cosine_scaled_reward/mean": 0.02616523765027523, + "rewards/cosine_scaled_reward/std": 0.460237056016922, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 96 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1777.0, + "completions/mean_length": 1170.390625, + "completions/mean_terminated_length": 988.2453002929688, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.11085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3103901743888855, + "learning_rate": 6.588648530198504e-07, + "loss": -0.0, + "num_tokens": 10739733.0, + "reward": 0.6633297204971313, + "reward_std": 0.609075665473938, + "rewards/cosine_scaled_reward/mean": -0.12927262485027313, + "rewards/cosine_scaled_reward/std": 0.4114542305469513, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1817.0, + "completions/mean_length": 1136.5625, + "completions/mean_terminated_length": 947.396240234375, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2510873079299927, + "learning_rate": 6.512279744547392e-07, + "loss": 0.0, + "num_tokens": 10823537.0, + "reward": 0.6613268256187439, + "reward_std": 0.4785424768924713, + "rewards/cosine_scaled_reward/mean": -0.09902409464120865, + "rewards/cosine_scaled_reward/std": 0.4345317482948303, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1171.8125, + "completions/mean_terminated_length": 1081.17236328125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.11314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.281054824590683, + "learning_rate": 6.435602608679916e-07, + "loss": -0.0, + "num_tokens": 10909701.0, + "reward": 1.0416245460510254, + "reward_std": 0.6949809789657593, + "rewards/cosine_scaled_reward/mean": 0.0520622618496418, + "rewards/cosine_scaled_reward/std": 0.508481502532959, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1120.8125, + "completions/mean_terminated_length": 1024.8966064453125, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.11428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2910788655281067, + "learning_rate": 6.358640479194451e-07, + "loss": 0.0, + "num_tokens": 10991145.0, + "reward": 1.2036188840866089, + "reward_std": 0.8533884286880493, + "rewards/cosine_scaled_reward/mean": 0.14087192714214325, + "rewards/cosine_scaled_reward/std": 0.5375887751579285, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1978.0, + "completions/mean_length": 1076.953125, + "completions/mean_terminated_length": 1029.1966552734375, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.11542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33955609798431396, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0, + "num_tokens": 11071502.0, + "reward": 0.7810705900192261, + "reward_std": 0.5973731279373169, + "rewards/cosine_scaled_reward/mean": -0.10165221989154816, + "rewards/cosine_scaled_reward/std": 0.4130260646343231, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 1092.078125, + "completions/mean_terminated_length": 935.654541015625, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.11657142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34537607431411743, + "learning_rate": 6.203955092681039e-07, + "loss": 0.0, + "num_tokens": 11151547.0, + "reward": 0.6441041231155396, + "reward_std": 0.53089839220047, + "rewards/cosine_scaled_reward/mean": -0.10763543844223022, + "rewards/cosine_scaled_reward/std": 0.39948928356170654, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1120.625, + 
"completions/mean_terminated_length": 1006.7368774414062, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.11771428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.343980997800827, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0, + "num_tokens": 11233619.0, + "reward": 0.6925251483917236, + "reward_std": 0.5938367247581482, + "rewards/cosine_scaled_reward/mean": -0.13029994070529938, + "rewards/cosine_scaled_reward/std": 0.37749138474464417, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1913.0, + "completions/mean_length": 1120.359375, + "completions/mean_terminated_length": 948.5740966796875, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.11885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30854102969169617, + "learning_rate": 6.048412045323164e-07, + "loss": -0.0, + "num_tokens": 11315786.0, + "reward": 0.560060977935791, + "reward_std": 0.5216183662414551, + "rewards/cosine_scaled_reward/mean": -0.1418444812297821, + "rewards/cosine_scaled_reward/std": 0.33836889266967773, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1897.0, + "completions/mean_length": 1158.421875, + "completions/mean_terminated_length": 953.1346435546875, + "completions/min_length": 503.0, + 
"completions/min_terminated_length": 503.0, + "epoch": 0.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29531243443489075, + "learning_rate": 5.97037808470444e-07, + "loss": -0.0, + "num_tokens": 11401213.0, + "reward": 1.0410652160644531, + "reward_std": 0.7858219742774963, + "rewards/cosine_scaled_reward/mean": 0.09084508568048477, + "rewards/cosine_scaled_reward/std": 0.5061684250831604, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1867.0, + "completions/mean_length": 1045.859375, + "completions/mean_terminated_length": 837.867919921875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.12114285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26259294152259827, + "learning_rate": 5.892200842364462e-07, + "loss": -0.0, + "num_tokens": 11478980.0, + "reward": 1.0545225143432617, + "reward_std": 0.7633667588233948, + "rewards/cosine_scaled_reward/mean": 0.07413630187511444, + "rewards/cosine_scaled_reward/std": 0.48842984437942505, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1101.234375, + "completions/mean_terminated_length": 946.30908203125, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.12228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 
0.3363504409790039, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0, + "num_tokens": 11560611.0, + "reward": 0.648673415184021, + "reward_std": 0.6051540970802307, + "rewards/cosine_scaled_reward/mean": -0.11316327750682831, + "rewards/cosine_scaled_reward/std": 0.37149766087532043, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1857.0, + "completions/mean_length": 1225.28125, + "completions/mean_terminated_length": 1054.5283203125, + "completions/min_length": 515.0, + "completions/min_terminated_length": 515.0, + "epoch": 0.12342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2867675721645355, + "learning_rate": 5.735511803093248e-07, + "loss": 0.0, + "num_tokens": 11649389.0, + "reward": 0.560509204864502, + "reward_std": 0.6691359877586365, + "rewards/cosine_scaled_reward/mean": -0.14943289756774902, + "rewards/cosine_scaled_reward/std": 0.4461749494075775, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1227.203125, + "completions/mean_terminated_length": 1056.84912109375, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.12457142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2772690951824188, + "learning_rate": 5.657047735161255e-07, + "loss": -0.0, + "num_tokens": 11739178.0, + "reward": 
0.6980891227722168, + "reward_std": 0.624833345413208, + "rewards/cosine_scaled_reward/mean": -0.0650179386138916, + "rewards/cosine_scaled_reward/std": 0.41062912344932556, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38025420904159546, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1727.0, + "completions/mean_length": 1145.0, + "completions/mean_terminated_length": 914.8235473632812, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.12571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3468596637248993, + "learning_rate": 5.578535828967777e-07, + "loss": -0.0, + "num_tokens": 11823234.0, + "reward": 0.6972323656082153, + "reward_std": 0.5477026104927063, + "rewards/cosine_scaled_reward/mean": -0.08888379484415054, + "rewards/cosine_scaled_reward/std": 0.3565239906311035, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1969.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 977.046875, + "completions/mean_terminated_length": 977.046875, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.12685714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3180137574672699, + "learning_rate": 5.5e-07, + "loss": 0.0, + "num_tokens": 11895885.0, + "reward": 0.8744360208511353, + "reward_std": 0.5815237164497375, + "rewards/cosine_scaled_reward/mean": -0.06278196722269058, + 
"rewards/cosine_scaled_reward/std": 0.37791064381599426, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 1269.421875, + "completions/mean_terminated_length": 1089.75, + "completions/min_length": 605.0, + "completions/min_terminated_length": 605.0, + "epoch": 0.128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2817465364933014, + "learning_rate": 5.421464171032224e-07, + "loss": -0.0, + "num_tokens": 11988224.0, + "reward": 0.9151681065559387, + "reward_std": 0.594943642616272, + "rewards/cosine_scaled_reward/mean": 0.02789657562971115, + "rewards/cosine_scaled_reward/std": 0.4965399205684662, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1910.0, + "completions/max_terminated_length": 1910.0, + "completions/mean_length": 934.578125, + "completions/mean_terminated_length": 934.578125, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.12914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3341560959815979, + "learning_rate": 5.342952264838747e-07, + "loss": -0.0, + "num_tokens": 12058333.0, + "reward": 1.0256879329681396, + "reward_std": 0.717230498790741, + "rewards/cosine_scaled_reward/mean": 0.02065650373697281, + "rewards/cosine_scaled_reward/std": 0.4963410794734955, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 113 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 1055.21875, + "completions/mean_terminated_length": 971.0847778320312, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.13028571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3800676763057709, + "learning_rate": 5.264488196906752e-07, + "loss": -0.0, + "num_tokens": 12135715.0, + "reward": 0.649993896484375, + "reward_std": 0.5865596532821655, + "rewards/cosine_scaled_reward/mean": -0.1750030517578125, + "rewards/cosine_scaled_reward/std": 0.3388007879257202, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 1169.671875, + "completions/mean_terminated_length": 987.3773803710938, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.13142857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3112519085407257, + "learning_rate": 5.186095868151436e-07, + "loss": 0.0, + "num_tokens": 12221790.0, + "reward": 0.7184536457061768, + "reward_std": 0.44992831349372864, + "rewards/cosine_scaled_reward/mean": -0.06264819949865341, + "rewards/cosine_scaled_reward/std": 0.44565486907958984, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 1224.890625, + "completions/mean_terminated_length": 1072.4630126953125, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.13257142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2884223461151123, + "learning_rate": 5.107799157635538e-07, + "loss": 0.0, + "num_tokens": 12311567.0, + "reward": 0.8372049927711487, + "reward_std": 0.608986496925354, + "rewards/cosine_scaled_reward/mean": -0.026710007339715958, + "rewards/cosine_scaled_reward/std": 0.4437602162361145, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 1078.65625, + "completions/mean_terminated_length": 1030.9835205078125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "epoch": 0.1337142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3016076385974884, + "learning_rate": 5.02962191529556e-07, + "loss": -0.0, + "num_tokens": 12391625.0, + "reward": 0.8182538747787476, + "reward_std": 0.6463132500648499, + "rewards/cosine_scaled_reward/mean": -0.09087307006120682, + "rewards/cosine_scaled_reward/std": 0.3895137310028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1946.0, + "completions/mean_length": 1226.046875, + "completions/mean_terminated_length": 952.0625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.13485714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2991194427013397, + "learning_rate": 4.951587954676837e-07, + "loss": 0.0, + "num_tokens": 12480628.0, + "reward": 0.6370267868041992, + "reward_std": 0.7525250911712646, + "rewards/cosine_scaled_reward/mean": -0.056486621499061584, + "rewards/cosine_scaled_reward/std": 0.44576171040534973, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4364357888698578, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1038.96875, + "completions/mean_terminated_length": 894.8214721679688, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4483291506767273, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0, + "num_tokens": 12557530.0, + "reward": 0.9855979084968567, + "reward_std": 0.6055079698562622, + "rewards/cosine_scaled_reward/mean": 0.04748644679784775, + "rewards/cosine_scaled_reward/std": 0.47108832001686096, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 856.578125, + "completions/mean_terminated_length": 
818.1451416015625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.13714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3406151831150055, + "learning_rate": 4.79604490731896e-07, + "loss": -0.0, + "num_tokens": 12622807.0, + "reward": 0.7979192733764648, + "reward_std": 0.6180044412612915, + "rewards/cosine_scaled_reward/mean": -0.10104038566350937, + "rewards/cosine_scaled_reward/std": 0.44317325949668884, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1764.0, + "completions/mean_length": 726.34375, + "completions/mean_terminated_length": 683.7096557617188, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.1382857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4178949296474457, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0, + "num_tokens": 12678989.0, + "reward": 1.161607265472412, + "reward_std": 0.6393733024597168, + "rewards/cosine_scaled_reward/mean": 0.08080361783504486, + "rewards/cosine_scaled_reward/std": 0.5313310027122498, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1133.796875, + "completions/mean_terminated_length": 1039.22412109375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.13942857142857143, + 
"frac_reward_zero_std": 0.0, + "grad_norm": 0.3333284258842468, + "learning_rate": 4.641359520805548e-07, + "loss": 0.0, + "num_tokens": 12763112.0, + "reward": 0.9356573820114136, + "reward_std": 0.6247758269309998, + "rewards/cosine_scaled_reward/mean": -0.02435879409313202, + "rewards/cosine_scaled_reward/std": 0.4759780466556549, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1031.296875, + "completions/mean_terminated_length": 981.2950439453125, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "epoch": 0.14057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29939791560173035, + "learning_rate": 4.5643973913200837e-07, + "loss": -0.0, + "num_tokens": 12839347.0, + "reward": 0.7725162506103516, + "reward_std": 0.5560778379440308, + "rewards/cosine_scaled_reward/mean": -0.09811685979366302, + "rewards/cosine_scaled_reward/std": 0.3822804391384125, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 979.234375, + "completions/mean_terminated_length": 944.758056640625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.1417142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34992095828056335, + "learning_rate": 4.4877202554526084e-07, + "loss": 0.0, 
+ "num_tokens": 12912970.0, + "reward": 1.085427165031433, + "reward_std": 0.6837464570999146, + "rewards/cosine_scaled_reward/mean": 0.05052608996629715, + "rewards/cosine_scaled_reward/std": 0.4791998267173767, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1076.40625, + "completions/mean_terminated_length": 994.0678100585938, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.14285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27060386538505554, + "learning_rate": 4.4113514698014953e-07, + "loss": -0.0, + "num_tokens": 12992788.0, + "reward": 1.0397578477859497, + "reward_std": 0.43823006749153137, + "rewards/cosine_scaled_reward/mean": 0.019878946244716644, + "rewards/cosine_scaled_reward/std": 0.46214956045150757, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1856.0, + "completions/mean_length": 1071.53125, + "completions/mean_terminated_length": 1006.4334106445312, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "epoch": 0.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2776121497154236, + "learning_rate": 4.3353142970386557e-07, + "loss": 0.0, + "num_tokens": 13072662.0, + "reward": 1.0028693675994873, + "reward_std": 0.6879971027374268, + "rewards/cosine_scaled_reward/mean": 
0.0014346465468406677, + "rewards/cosine_scaled_reward/std": 0.42488595843315125, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1868.0, + "completions/mean_length": 1180.484375, + "completions/mean_terminated_length": 1056.5535888671875, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.14514285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2829054594039917, + "learning_rate": 4.2596318988235037e-07, + "loss": -0.0, + "num_tokens": 13159309.0, + "reward": 0.6576684713363647, + "reward_std": 0.66895592212677, + "rewards/cosine_scaled_reward/mean": -0.15554077923297882, + "rewards/cosine_scaled_reward/std": 0.3959099054336548, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1869.0, + "completions/mean_length": 1053.328125, + "completions/mean_terminated_length": 950.4310302734375, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.1462857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29738253355026245, + "learning_rate": 4.1843273287476854e-07, + "loss": -0.0, + "num_tokens": 13237074.0, + "reward": 0.8851851224899292, + "reward_std": 0.7390589118003845, + "rewards/cosine_scaled_reward/mean": -0.041782446205616, + "rewards/cosine_scaled_reward/std": 0.46901625394821167, + "rewards/format_reward/mean": 0.96875, + 
"rewards/format_reward/std": 0.17536810040473938, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 1228.484375, + "completions/mean_terminated_length": 1111.4107666015625, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.14742857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25943535566329956, + "learning_rate": 4.1094235253127374e-07, + "loss": -0.0, + "num_tokens": 13326401.0, + "reward": 0.9628820419311523, + "reward_std": 0.6490253210067749, + "rewards/cosine_scaled_reward/mean": 0.004878522828221321, + "rewards/cosine_scaled_reward/std": 0.45456331968307495, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 1089.578125, + "completions/mean_terminated_length": 952.6607666015625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.14857142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3009719252586365, + "learning_rate": 4.034943304942796e-07, + "loss": 0.0, + "num_tokens": 13406638.0, + "reward": 0.5984547138214111, + "reward_std": 0.7008002996444702, + "rewards/cosine_scaled_reward/mean": -0.14608514308929443, + "rewards/cosine_scaled_reward/std": 0.37894922494888306, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1651.0, + "completions/mean_length": 1058.03125, + "completions/mean_terminated_length": 916.607177734375, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.14971428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.306725412607193, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0, + "num_tokens": 13484088.0, + "reward": 1.0469268560409546, + "reward_std": 0.6023457050323486, + "rewards/cosine_scaled_reward/mean": 0.0703384131193161, + "rewards/cosine_scaled_reward/std": 0.47298464179039, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1773.0, + "completions/mean_length": 1342.78125, + "completions/mean_terminated_length": 919.6500244140625, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.15085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3032574951648712, + "learning_rate": 3.8873442270461485e-07, + "loss": -0.0, + "num_tokens": 13581090.0, + "reward": 0.4643245339393616, + "reward_std": 0.7533800601959229, + "rewards/cosine_scaled_reward/mean": -0.06471271812915802, + "rewards/cosine_scaled_reward/std": 0.4610835611820221, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49501484632492065, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, 
+ "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1144.921875, + "completions/mean_terminated_length": 957.4906005859375, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32285141944885254, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.0, + "num_tokens": 13665589.0, + "reward": 0.5014957189559937, + "reward_std": 0.5352932214736938, + "rewards/cosine_scaled_reward/mean": -0.17112717032432556, + "rewards/cosine_scaled_reward/std": 0.28127768635749817, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 975.53125, + "completions/mean_terminated_length": 958.5079956054688, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.15314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40716752409935, + "learning_rate": 3.7417099217982686e-07, + "loss": -0.0, + "num_tokens": 13738591.0, + "reward": 1.1759617328643799, + "reward_std": 0.4804629683494568, + "rewards/cosine_scaled_reward/mean": 0.08798093348741531, + "rewards/cosine_scaled_reward/std": 0.5343761444091797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1686.0, + "completions/max_terminated_length": 1686.0, + 
"completions/mean_length": 758.515625, + "completions/mean_terminated_length": 758.515625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.15428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42696353793144226, + "learning_rate": 3.6696851061588994e-07, + "loss": -0.0, + "num_tokens": 13797608.0, + "reward": 1.3851683139801025, + "reward_std": 0.5234883427619934, + "rewards/cosine_scaled_reward/mean": 0.19258417189121246, + "rewards/cosine_scaled_reward/std": 0.49346473813056946, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1169.875, + "completions/mean_terminated_length": 1095.4576416015625, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.15542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28027620911598206, + "learning_rate": 3.5982178221668533e-07, + "loss": -0.0, + "num_tokens": 13883152.0, + "reward": 1.0174503326416016, + "reward_std": 0.5889347791671753, + "rewards/cosine_scaled_reward/mean": 0.016537662595510483, + "rewards/cosine_scaled_reward/std": 0.4763922095298767, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1814.0, + "completions/mean_length": 1105.3125, + "completions/mean_terminated_length": 1042.4666748046875, + "completions/min_length": 446.0, + 
"completions/min_terminated_length": 446.0, + "epoch": 0.15657142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3002299666404724, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0, + "num_tokens": 13964500.0, + "reward": 0.841381847858429, + "reward_std": 0.6354345083236694, + "rewards/cosine_scaled_reward/mean": -0.07149658352136612, + "rewards/cosine_scaled_reward/std": 0.4138363003730774, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1125.484375, + "completions/mean_terminated_length": 974.5272216796875, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.15771428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28766506910324097, + "learning_rate": 3.45704275117204e-07, + "loss": -0.0, + "num_tokens": 14047843.0, + "reward": 0.8758631944656372, + "reward_std": 0.7212573289871216, + "rewards/cosine_scaled_reward/mean": -0.05425591766834259, + "rewards/cosine_scaled_reward/std": 0.4783853590488434, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1216.171875, + "completions/mean_terminated_length": 1160.7166748046875, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.15885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 
0.2882857024669647, + "learning_rate": 3.387377967463493e-07, + "loss": -0.0, + "num_tokens": 14136318.0, + "reward": 0.7189284563064575, + "reward_std": 0.4593912959098816, + "rewards/cosine_scaled_reward/mean": -0.13272328674793243, + "rewards/cosine_scaled_reward/std": 0.33584704995155334, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1142.140625, + "completions/mean_terminated_length": 1012.732177734375, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.16, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.3000667095184326, + "learning_rate": 3.3183567088914833e-07, + "loss": 0.0, + "num_tokens": 14219639.0, + "reward": 0.8278639316558838, + "reward_std": 0.46724599599838257, + "rewards/cosine_scaled_reward/mean": -0.03919300064444542, + "rewards/cosine_scaled_reward/std": 0.4650508463382721, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1919.0, + "completions/mean_length": 1025.421875, + "completions/mean_terminated_length": 975.131103515625, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.16114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3207882046699524, + "learning_rate": 3.250000000000001e-07, + "loss": 0.0, + "num_tokens": 14295826.0, + "reward": 
0.8871637582778931, + "reward_std": 0.6538586616516113, + "rewards/cosine_scaled_reward/mean": -0.04079316183924675, + "rewards/cosine_scaled_reward/std": 0.43451616168022156, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1233.90625, + "completions/mean_terminated_length": 1149.689697265625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.16228571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3009903132915497, + "learning_rate": 3.182328662904756e-07, + "loss": 0.0, + "num_tokens": 14385300.0, + "reward": 0.8573208451271057, + "reward_std": 0.6099269390106201, + "rewards/cosine_scaled_reward/mean": -0.055714573711156845, + "rewards/cosine_scaled_reward/std": 0.43728360533714294, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 1136.078125, + "completions/mean_terminated_length": 1005.8035888671875, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.16342857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31794917583465576, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0, + "num_tokens": 14468825.0, + "reward": 0.6553314924240112, + "reward_std": 0.6344339847564697, + "rewards/cosine_scaled_reward/mean": 
-0.11764675378799438, + "rewards/cosine_scaled_reward/std": 0.3099633455276489, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 1220.6875, + "completions/mean_terminated_length": 1029.769287109375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.16457142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3814108967781067, + "learning_rate": 3.0491243424323783e-07, + "loss": 0.0, + "num_tokens": 14558437.0, + "reward": 0.7285318970680237, + "reward_std": 0.8925961256027222, + "rewards/cosine_scaled_reward/mean": -0.05760904401540756, + "rewards/cosine_scaled_reward/std": 0.492266446352005, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36596253514289856, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1880.0, + "completions/mean_length": 969.796875, + "completions/mean_terminated_length": 916.7704467773438, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.1657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3201180398464203, + "learning_rate": 2.9836319343816397e-07, + "loss": -0.0, + "num_tokens": 14630448.0, + "reward": 0.8149441480636597, + "reward_std": 0.5824600458145142, + "rewards/cosine_scaled_reward/mean": -0.08471541851758957, + "rewards/cosine_scaled_reward/std": 0.475755512714386, + "rewards/format_reward/mean": 
0.984375, + "rewards/format_reward/std": 0.125, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1943.0, + "completions/mean_length": 1034.484375, + "completions/mean_terminated_length": 966.9166870117188, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, + "epoch": 0.16685714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28184273838996887, + "learning_rate": 2.918906036420294e-07, + "loss": -0.0, + "num_tokens": 14707271.0, + "reward": 0.8387603759765625, + "reward_std": 0.5346506237983704, + "rewards/cosine_scaled_reward/mean": -0.07280732691287994, + "rewards/cosine_scaled_reward/std": 0.43024110794067383, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 1249.984375, + "completions/mean_terminated_length": 1046.568603515625, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "epoch": 0.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32145801186561584, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0, + "num_tokens": 14798054.0, + "reward": 0.7505484819412231, + "reward_std": 0.5473448634147644, + "rewards/cosine_scaled_reward/mean": -0.07003828883171082, + "rewards/cosine_scaled_reward/std": 0.4046306014060974, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3145764470100403, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1844.0, + "completions/mean_length": 1062.828125, + "completions/mean_terminated_length": 960.913818359375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.16914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2667451500892639, + "learning_rate": 2.791832395815782e-07, + "loss": -0.0, + "num_tokens": 14877259.0, + "reward": 0.7823130488395691, + "reward_std": 0.48230016231536865, + "rewards/cosine_scaled_reward/mean": -0.06978099048137665, + "rewards/cosine_scaled_reward/std": 0.37567150592803955, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1386.875, + "completions/mean_terminated_length": 1086.3636474609375, + "completions/min_length": 439.0, + "completions/min_terminated_length": 439.0, + "epoch": 0.1702857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2730913758277893, + "learning_rate": 2.729523361034538e-07, + "loss": 0.0, + "num_tokens": 14977915.0, + "reward": 0.48214927315711975, + "reward_std": 0.8376681804656982, + "rewards/cosine_scaled_reward/mean": -0.14173786342144012, + "rewards/cosine_scaled_reward/std": 0.4272434711456299, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1831.0, + "completions/mean_length": 994.15625, + "completions/mean_terminated_length": 942.3278198242188, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.17142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2946690022945404, + "learning_rate": 2.6680582402757324e-07, + "loss": -0.0, + "num_tokens": 15052045.0, + "reward": 0.8893749713897705, + "reward_std": 0.7130615711212158, + "rewards/cosine_scaled_reward/mean": -0.05531252920627594, + "rewards/cosine_scaled_reward/std": 0.4389563202857971, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1094.4375, + "completions/mean_terminated_length": 917.8518676757812, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.17257142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29988256096839905, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0, + "num_tokens": 15132769.0, + "reward": 1.088501214981079, + "reward_std": 0.9213382005691528, + "rewards/cosine_scaled_reward/mean": 0.10675054788589478, + "rewards/cosine_scaled_reward/std": 0.510394811630249, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1976.0, + 
"completions/mean_length": 1024.203125, + "completions/mean_terminated_length": 937.440673828125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.1737142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46614158153533936, + "learning_rate": 2.547734369542718e-07, + "loss": -0.0, + "num_tokens": 15208982.0, + "reward": 0.7280048131942749, + "reward_std": 0.706195592880249, + "rewards/cosine_scaled_reward/mean": -0.10474759340286255, + "rewards/cosine_scaled_reward/std": 0.45987388491630554, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 1180.234375, + "completions/mean_terminated_length": 1056.2679443359375, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.17485714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33488133549690247, + "learning_rate": 2.488912271385139e-07, + "loss": -0.0, + "num_tokens": 15295661.0, + "reward": 0.4985957443714142, + "reward_std": 0.4677598178386688, + "rewards/cosine_scaled_reward/mean": -0.2272646427154541, + "rewards/cosine_scaled_reward/std": 0.2307518571615219, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1340.296875, + "completions/mean_terminated_length": 1142.1400146484375, + 
"completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "epoch": 0.176, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25304633378982544, + "learning_rate": 2.4310073797187573e-07, + "loss": -0.0, + "num_tokens": 15392504.0, + "reward": 0.7636169195175171, + "reward_std": 0.7114115953445435, + "rewards/cosine_scaled_reward/mean": -0.03225403279066086, + "rewards/cosine_scaled_reward/std": 0.42686402797698975, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38025420904159546, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1909.0, + "completions/mean_length": 915.6875, + "completions/mean_terminated_length": 798.5516967773438, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.17714285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35448068380355835, + "learning_rate": 2.374037332934512e-07, + "loss": 0.0, + "num_tokens": 15461732.0, + "reward": 0.736025333404541, + "reward_std": 0.5466883182525635, + "rewards/cosine_scaled_reward/mean": -0.11636234819889069, + "rewards/cosine_scaled_reward/std": 0.43356192111968994, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1913.0, + "completions/mean_length": 1157.90625, + "completions/mean_terminated_length": 952.5000610351562, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.1782857142857143, + 
"frac_reward_zero_std": 0.0, + "grad_norm": 0.4357910454273224, + "learning_rate": 2.3180194846605364e-07, + "loss": -0.0, + "num_tokens": 15545942.0, + "reward": 0.8330824971199036, + "reward_std": 0.725536048412323, + "rewards/cosine_scaled_reward/mean": -0.02095877379179001, + "rewards/cosine_scaled_reward/std": 0.4767586290836334, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1157.75, + "completions/mean_terminated_length": 1030.571533203125, + "completions/min_length": 485.0, + "completions/min_terminated_length": 485.0, + "epoch": 0.17942857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29891225695610046, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.0, + "num_tokens": 15629998.0, + "reward": 0.6674755811691284, + "reward_std": 0.6577311754226685, + "rewards/cosine_scaled_reward/mean": -0.13501222431659698, + "rewards/cosine_scaled_reward/std": 0.36102381348609924, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1856.0, + "completions/mean_length": 1013.6875, + "completions/mean_terminated_length": 962.8196411132812, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.18057142857142858, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.2723560333251953, + "learning_rate": 2.2089083427137329e-07, + "loss": 
0.0, + "num_tokens": 15704994.0, + "reward": 0.9709224104881287, + "reward_std": 0.48810505867004395, + "rewards/cosine_scaled_reward/mean": -0.014538809657096863, + "rewards/cosine_scaled_reward/std": 0.4970093369483948, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 1081.296875, + "completions/mean_terminated_length": 962.5789794921875, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "epoch": 0.18171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2894439697265625, + "learning_rate": 2.1558482853517253e-07, + "loss": -0.0, + "num_tokens": 15785877.0, + "reward": 0.5938807725906372, + "reward_std": 0.592242956161499, + "rewards/cosine_scaled_reward/mean": -0.16399714350700378, + "rewards/cosine_scaled_reward/std": 0.3423241078853607, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1750.0, + "completions/mean_length": 968.25, + "completions/mean_terminated_length": 915.1474609375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.18285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3261898159980774, + "learning_rate": 2.1038068889975259e-07, + "loss": 0.0, + "num_tokens": 15859429.0, + "reward": 1.2050117254257202, + "reward_std": 0.6944217681884766, + 
"rewards/cosine_scaled_reward/mean": 0.10250584781169891, + "rewards/cosine_scaled_reward/std": 0.5283173322677612, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1031.75, + "completions/mean_terminated_length": 945.6271362304688, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34274861216545105, + "learning_rate": 2.0528000059645995e-07, + "loss": -0.0, + "num_tokens": 15935453.0, + "reward": 0.9563960433006287, + "reward_std": 0.6316370964050293, + "rewards/cosine_scaled_reward/mean": 0.009448029100894928, + "rewards/cosine_scaled_reward/std": 0.46292582154273987, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1891.0, + "completions/mean_length": 1167.828125, + "completions/mean_terminated_length": 898.3877563476562, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "epoch": 0.18514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3887297511100769, + "learning_rate": 2.0028431734436308e-07, + "loss": 0.0, + "num_tokens": 16020498.0, + "reward": 0.6932262182235718, + "reward_std": 0.8278101682662964, + "rewards/cosine_scaled_reward/mean": -0.08307439833879471, + "rewards/cosine_scaled_reward/std": 0.3847581744194031, + 
"rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1820.0, + "completions/mean_length": 1058.84375, + "completions/mean_terminated_length": 956.5172119140625, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.18628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30917680263519287, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.0, + "num_tokens": 16099448.0, + "reward": 1.3529155254364014, + "reward_std": 0.8906396627426147, + "rewards/cosine_scaled_reward/mean": 0.22333277761936188, + "rewards/cosine_scaled_reward/std": 0.5322388410568237, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 988.703125, + "completions/mean_terminated_length": 918.0833740234375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.18742857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33646658062934875, + "learning_rate": 1.9061402047871833e-07, + "loss": 0.0, + "num_tokens": 16173253.0, + "reward": 1.046778678894043, + "reward_std": 0.6892427206039429, + "rewards/cosine_scaled_reward/mean": 0.0390143096446991, + "rewards/cosine_scaled_reward/std": 0.4476637840270996, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 164 + }, + { 
+ "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1103.5, + "completions/mean_terminated_length": 948.9454345703125, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.18857142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.338925838470459, + "learning_rate": 1.8594235253127372e-07, + "loss": -0.0, + "num_tokens": 16255293.0, + "reward": 0.7887892723083496, + "reward_std": 0.6329070329666138, + "rewards/cosine_scaled_reward/mean": -0.0665428563952446, + "rewards/cosine_scaled_reward/std": 0.4880979061126709, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1714.0, + "completions/mean_length": 1166.265625, + "completions/mean_terminated_length": 1002.9815063476562, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "epoch": 0.18971428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29118841886520386, + "learning_rate": 1.8138158006995363e-07, + "loss": -0.0, + "num_tokens": 16341510.0, + "reward": 0.5021259784698486, + "reward_std": 0.5949545502662659, + "rewards/cosine_scaled_reward/mean": -0.18643701076507568, + "rewards/cosine_scaled_reward/std": 0.3388413190841675, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1756.0, + "completions/mean_length": 1027.96875, + "completions/mean_terminated_length": 922.4482421875, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.19085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3123703598976135, + "learning_rate": 1.7693309235023127e-07, + "loss": -0.0, + "num_tokens": 16418844.0, + "reward": 0.6054480671882629, + "reward_std": 0.6668864488601685, + "rewards/cosine_scaled_reward/mean": -0.17383846640586853, + "rewards/cosine_scaled_reward/std": 0.34976449608802795, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1131.890625, + "completions/mean_terminated_length": 1086.8360595703125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2515013515949249, + "learning_rate": 1.7259824442455923e-07, + "loss": 0.0, + "num_tokens": 16502125.0, + "reward": 0.929424524307251, + "reward_std": 0.6242066621780396, + "rewards/cosine_scaled_reward/mean": -0.011850237846374512, + "rewards/cosine_scaled_reward/std": 0.4718935191631317, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + 
"completions/max_length": 2048.0, + "completions/max_terminated_length": 1737.0, + "completions/mean_length": 908.53125, + "completions/mean_terminated_length": 871.774169921875, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "epoch": 0.19314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29841023683547974, + "learning_rate": 1.6837835672960831e-07, + "loss": -0.0, + "num_tokens": 16570895.0, + "reward": 1.6184587478637695, + "reward_std": 0.5710533857345581, + "rewards/cosine_scaled_reward/mean": 0.3092293441295624, + "rewards/cosine_scaled_reward/std": 0.5226604342460632, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1005.109375, + "completions/mean_terminated_length": 834.4545288085938, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.19428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3179849088191986, + "learning_rate": 1.6427471468404952e-07, + "loss": -0.0, + "num_tokens": 16645006.0, + "reward": 1.0071099996566772, + "reward_std": 0.3746073246002197, + "rewards/cosine_scaled_reward/mean": 0.06605499982833862, + "rewards/cosine_scaled_reward/std": 0.4378518760204315, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1891.0, + "completions/mean_length": 1234.65625, + 
"completions/mean_terminated_length": 940.4680786132812, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "epoch": 0.19542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2992324233055115, + "learning_rate": 1.6028856829700258e-07, + "loss": -0.0, + "num_tokens": 16734416.0, + "reward": 0.7108581066131592, + "reward_std": 0.7254206538200378, + "rewards/cosine_scaled_reward/mean": -0.02738344669342041, + "rewards/cosine_scaled_reward/std": 0.44080549478530884, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42695629596710205, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1638.0, + "completions/mean_length": 900.234375, + "completions/mean_terminated_length": 823.7167358398438, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.19657142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47149336338043213, + "learning_rate": 1.5642113178727193e-07, + "loss": 0.0, + "num_tokens": 16802647.0, + "reward": 1.3995718955993652, + "reward_std": 0.5902794599533081, + "rewards/cosine_scaled_reward/mean": 0.2310360074043274, + "rewards/cosine_scaled_reward/std": 0.5026565194129944, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 925.078125, + "completions/mean_terminated_length": 787.1754150390625, + "completions/min_length": 242.0, + 
"completions/min_terminated_length": 242.0, + "epoch": 0.1977142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3893924057483673, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0, + "num_tokens": 16873164.0, + "reward": 0.6720038056373596, + "reward_std": 0.667186975479126, + "rewards/cosine_scaled_reward/mean": -0.12493559718132019, + "rewards/cosine_scaled_reward/std": 0.40216636657714844, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1005.578125, + "completions/mean_terminated_length": 971.9515991210938, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.19885714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39529484510421753, + "learning_rate": 1.4904706411523448e-07, + "loss": -0.0, + "num_tokens": 16947857.0, + "reward": 0.9172019958496094, + "reward_std": 0.6198633313179016, + "rewards/cosine_scaled_reward/mean": -0.03358650952577591, + "rewards/cosine_scaled_reward/std": 0.4403606951236725, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1828.0, + "completions/mean_length": 952.296875, + "completions/mean_terminated_length": 898.4097900390625, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 
0.322712779045105, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.0, + "num_tokens": 17019628.0, + "reward": 0.871549129486084, + "reward_std": 0.46009254455566406, + "rewards/cosine_scaled_reward/mean": -0.05641293525695801, + "rewards/cosine_scaled_reward/std": 0.44415631890296936, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1048.453125, + "completions/mean_terminated_length": 945.0516967773438, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.20114285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3287680447101593, + "learning_rate": 1.4216149583350755e-07, + "loss": -0.0, + "num_tokens": 17097897.0, + "reward": 0.839117705821991, + "reward_std": 0.7753168344497681, + "rewards/cosine_scaled_reward/mean": -0.04137861356139183, + "rewards/cosine_scaled_reward/std": 0.43453913927078247, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1915.0, + "completions/mean_length": 968.34375, + "completions/mean_terminated_length": 933.51611328125, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "epoch": 0.2022857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3266870677471161, + "learning_rate": 1.3890454406082956e-07, + "loss": -0.0, + "num_tokens": 17170095.0, + "reward": 
1.0329997539520264, + "reward_std": 0.7290528416633606, + "rewards/cosine_scaled_reward/mean": 0.024312350898981094, + "rewards/cosine_scaled_reward/std": 0.46764034032821655, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1016.0625, + "completions/mean_terminated_length": 909.3103637695312, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.20342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.330020546913147, + "learning_rate": 1.3577281594640182e-07, + "loss": -0.0, + "num_tokens": 17246659.0, + "reward": 1.1118203401565552, + "reward_std": 0.7913287878036499, + "rewards/cosine_scaled_reward/mean": 0.07934767752885818, + "rewards/cosine_scaled_reward/std": 0.5148099660873413, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 1227.78125, + "completions/mean_terminated_length": 976.69384765625, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.20457142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33415722846984863, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0, + "num_tokens": 17336069.0, + "reward": 0.608305037021637, + "reward_std": 0.5569274425506592, + "rewards/cosine_scaled_reward/mean": -0.10991000384092331, + 
"rewards/cosine_scaled_reward/std": 0.3418741822242737, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38025420904159546, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1932.0, + "completions/mean_length": 1024.46875, + "completions/mean_terminated_length": 956.2333984375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.2057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3402194082736969, + "learning_rate": 1.2988880807625927e-07, + "loss": -0.0, + "num_tokens": 17412811.0, + "reward": 1.6137604713439941, + "reward_std": 0.8008866310119629, + "rewards/cosine_scaled_reward/mean": 0.31469273567199707, + "rewards/cosine_scaled_reward/std": 0.5089212656021118, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1157.546875, + "completions/mean_terminated_length": 992.6481323242188, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.20685714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29356250166893005, + "learning_rate": 1.2713832064634125e-07, + "loss": -0.0, + "num_tokens": 17498366.0, + "reward": 0.7507010698318481, + "reward_std": 0.5088521242141724, + "rewards/cosine_scaled_reward/mean": -0.07777446508407593, + "rewards/cosine_scaled_reward/std": 0.4100310504436493, + "rewards/format_reward/mean": 0.90625, + 
"rewards/format_reward/std": 0.29378482699394226, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1840.0, + "completions/mean_length": 1166.390625, + "completions/mean_terminated_length": 896.5101928710938, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2653217613697052, + "learning_rate": 1.2451664098030743e-07, + "loss": -0.0, + "num_tokens": 17582807.0, + "reward": 0.7447050213813782, + "reward_std": 0.8267481327056885, + "rewards/cosine_scaled_reward/mean": -0.04170997440814972, + "rewards/cosine_scaled_reward/std": 0.4390917420387268, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38025420904159546, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1843.0, + "completions/mean_length": 1003.203125, + "completions/mean_terminated_length": 933.550048828125, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.20914285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3268946707248688, + "learning_rate": 1.220245676671809e-07, + "loss": 0.0, + "num_tokens": 17657628.0, + "reward": 1.0635898113250732, + "reward_std": 0.5967966914176941, + "rewards/cosine_scaled_reward/mean": 0.039607420563697815, + "rewards/cosine_scaled_reward/std": 0.43730178475379944, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 1025.171875, + "completions/mean_terminated_length": 938.4915161132812, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.2102857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36050307750701904, + "learning_rate": 1.1966285981663407e-07, + "loss": 0.0, + "num_tokens": 17734591.0, + "reward": 0.6448719501495361, + "reward_std": 0.503462553024292, + "rewards/cosine_scaled_reward/mean": -0.14631402492523193, + "rewards/cosine_scaled_reward/std": 0.3733954429626465, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1779.0, + "completions/mean_length": 969.015625, + "completions/mean_terminated_length": 934.2096557617188, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.21142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42219310998916626, + "learning_rate": 1.1743223682775649e-07, + "loss": -0.0, + "num_tokens": 17806792.0, + "reward": 0.7470877766609192, + "reward_std": 0.5973426103591919, + "rewards/cosine_scaled_reward/mean": -0.11864358186721802, + "rewards/cosine_scaled_reward/std": 0.41184645891189575, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 
0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1667.0, + "completions/mean_length": 1076.984375, + "completions/mean_terminated_length": 938.2678833007812, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.21257142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30614498257637024, + "learning_rate": 1.1533337816991931e-07, + "loss": -0.0, + "num_tokens": 17886415.0, + "reward": 0.804481029510498, + "reward_std": 0.4629480838775635, + "rewards/cosine_scaled_reward/mean": -0.03525950014591217, + "rewards/cosine_scaled_reward/std": 0.45060867071151733, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3333333432674408, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 1112.1875, + "completions/mean_terminated_length": 1049.800048828125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.21371428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4846937656402588, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.0, + "num_tokens": 17968019.0, + "reward": 0.6981200575828552, + "reward_std": 0.53022301197052, + "rewards/cosine_scaled_reward/mean": -0.1275024712085724, + "rewards/cosine_scaled_reward/std": 0.38560083508491516, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21304203569889069, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1910.0, + 
"completions/mean_length": 1079.90625, + "completions/mean_terminated_length": 997.8643798828125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.21485714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34140780568122864, + "learning_rate": 1.1153347084664419e-07, + "loss": -0.0, + "num_tokens": 18048933.0, + "reward": 0.5326423645019531, + "reward_std": 0.5487440824508667, + "rewards/cosine_scaled_reward/mean": -0.22586631774902344, + "rewards/cosine_scaled_reward/std": 0.3085760772228241, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 868.546875, + "completions/mean_terminated_length": 830.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6607878804206848, + "learning_rate": 1.0983357966978745e-07, + "loss": -0.0, + "num_tokens": 18113808.0, + "reward": 0.7490335702896118, + "reward_std": 0.6654466390609741, + "rewards/cosine_scaled_reward/mean": -0.11767073720693588, + "rewards/cosine_scaled_reward/std": 0.4015049338340759, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1713.0, + "completions/mean_length": 938.8125, + "completions/mean_terminated_length": 903.0322265625, + "completions/min_length": 359.0, + 
"completions/min_terminated_length": 359.0, + "epoch": 0.21714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3069080710411072, + "learning_rate": 1.0826776744855121e-07, + "loss": -0.0, + "num_tokens": 18183660.0, + "reward": 0.9838922023773193, + "reward_std": 0.5085676908493042, + "rewards/cosine_scaled_reward/mean": -0.00024138391017913818, + "rewards/cosine_scaled_reward/std": 0.44459760189056396, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1851.0, + "completions/max_terminated_length": 1851.0, + "completions/mean_length": 902.453125, + "completions/mean_terminated_length": 902.453125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.21828571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35081905126571655, + "learning_rate": 1.068365111445064e-07, + "loss": 0.0, + "num_tokens": 18251705.0, + "reward": 1.247175931930542, + "reward_std": 0.8716963529586792, + "rewards/cosine_scaled_reward/mean": 0.13140051066875458, + "rewards/cosine_scaled_reward/std": 0.5292099118232727, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1270.0, + "completions/mean_terminated_length": 1052.1600341796875, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "epoch": 0.21942857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2882588505744934, + 
"learning_rate": 1.0554024673218806e-07, + "loss": -0.0, + "num_tokens": 18344281.0, + "reward": 0.5913476943969727, + "reward_std": 0.6203497052192688, + "rewards/cosine_scaled_reward/mean": -0.11057613790035248, + "rewards/cosine_scaled_reward/std": 0.33690571784973145, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39339789748191833, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1806.0, + "completions/mean_length": 1134.3125, + "completions/mean_terminated_length": 1022.1052856445312, + "completions/min_length": 475.0, + "completions/min_terminated_length": 475.0, + "epoch": 0.22057142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30495956540107727, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0, + "num_tokens": 18428021.0, + "reward": 0.9724597930908203, + "reward_std": 0.6338238716125488, + "rewards/cosine_scaled_reward/mean": 0.025292381644248962, + "rewards/cosine_scaled_reward/std": 0.47308972477912903, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27048972249031067, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1284.34375, + "completions/mean_terminated_length": 1050.5714111328125, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.22171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29666370153427124, + "learning_rate": 1.0335423176140511e-07, + "loss": -0.0, + "num_tokens": 18521579.0, + "reward": 
0.970361590385437, + "reward_std": 0.8541973829269409, + "rewards/cosine_scaled_reward/mean": 0.055493295192718506, + "rewards/cosine_scaled_reward/std": 0.5139825344085693, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3503824472427368, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1614.0, + "completions/mean_length": 1111.140625, + "completions/mean_terminated_length": 957.8363037109375, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "epoch": 0.22285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2935192883014679, + "learning_rate": 1.0246514708427701e-07, + "loss": -0.0, + "num_tokens": 18603836.0, + "reward": 0.9238024353981018, + "reward_std": 0.7688024044036865, + "rewards/cosine_scaled_reward/mean": 0.008776212111115456, + "rewards/cosine_scaled_reward/std": 0.4346567392349243, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29378482699394226, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1913.0, + "completions/mean_length": 1110.28125, + "completions/mean_terminated_length": 1064.163818359375, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "epoch": 0.224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31850409507751465, + "learning_rate": 1.017123858587145e-07, + "loss": 0.0, + "num_tokens": 18686486.0, + "reward": 1.0064561367034912, + "reward_std": 0.6142268776893616, + "rewards/cosine_scaled_reward/mean": 0.0032280460000038147, + 
"rewards/cosine_scaled_reward/std": 0.4689313769340515, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1862.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 867.390625, + "completions/mean_terminated_length": 867.390625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.22514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36897119879722595, + "learning_rate": 1.0109617738307911e-07, + "loss": 0.0, + "num_tokens": 18752367.0, + "reward": 1.2200298309326172, + "reward_std": 0.7840542197227478, + "rewards/cosine_scaled_reward/mean": 0.11001493036746979, + "rewards/cosine_scaled_reward/std": 0.5105303525924683, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1018.171875, + "completions/mean_terminated_length": 911.637939453125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.22628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33654487133026123, + "learning_rate": 1.0061670936044178e-07, + "loss": 0.0, + "num_tokens": 18829034.0, + "reward": 1.0653846263885498, + "reward_std": 0.7624523043632507, + "rewards/cosine_scaled_reward/mean": 0.04831730201840401, + "rewards/cosine_scaled_reward/std": 0.4961619973182678, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17536810040473938, + "step": 198 
+ }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1170.84375, + "completions/mean_terminated_length": 1096.5084228515625, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "epoch": 0.22742857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28278952836990356, + "learning_rate": 1.002741278414069e-07, + "loss": 0.0, + "num_tokens": 18915472.0, + "reward": 0.6831471920013428, + "reward_std": 0.6951984167098999, + "rewards/cosine_scaled_reward/mean": -0.1506139189004898, + "rewards/cosine_scaled_reward/std": 0.34608688950538635, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.125, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1764.0, + "completions/mean_length": 999.390625, + "completions/mean_terminated_length": 849.5892944335938, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "epoch": 0.22857142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28817513585090637, + "learning_rate": 1.0006853717962393e-07, + "loss": 0.0, + "num_tokens": 18989553.0, + "reward": 0.9030377864837646, + "reward_std": 0.8171917200088501, + "rewards/cosine_scaled_reward/mean": -0.01723114401102066, + "rewards/cosine_scaled_reward/std": 0.4829805791378021, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24397502839565277, + "step": 200 + }, + { + "epoch": 0.22857142857142856, + "step": 200, + "total_flos": 0.0, + "train_loss": 3.2957177609205244e-09, + 
"train_runtime": 10011.2078, + "train_samples_per_second": 1.279, + "train_steps_per_second": 0.02 + } + ], + "logging_steps": 1, + "max_steps": 200, + "num_input_tokens_seen": 18989553, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..9e03ee7 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec4fbe7e29ae78abab9c9fd5555870c1bffe7656ceef4ac5fa2148a15b61b1e3 +size 8888