From d1d83abf726be614b460556fb1f71bfa5fef4788 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Sat, 30 May 2026 02:44:18 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: yuerxin/DeepSeek-R1-Distill-Qwen-1.5B-GRPO Source: Original Platform --- .gitattributes | 36 + README.md | 70 + all_results.json | 8 + chat_template.jinja | 1 + config.json | 29 + generation_config.json | 9 + model.safetensors | 3 + special_tokens_map.json | 23 + tokenizer.json | 3 + tokenizer_config.json | 194 +++ train_results.json | 8 + trainer_state.json | 3043 +++++++++++++++++++++++++++++++++++++++ training_args.bin | 3 + 13 files changed, 3430 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model 
filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..141d2d3 --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +datasets: open-r1/OpenR1-Math-220k +library_name: transformers +model_name: DeepSeek-R1-Distill-Qwen-1.5B-GRPO +tags: +- generated_from_trainer +- open-r1 +- trl +- grpo +licence: license +--- + +# Model Card for DeepSeek-R1-Distill-Qwen-1.5B-GRPO + +This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on the [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). 
+ +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Sayram/DeepSeek-R1-Distill-Qwen-1.5B-GRPO", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/erxin/huggingface/runs/yu4p6fnd) + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + +### Framework versions + +- TRL: 0.18.0 +- Transformers: 4.52.3 +- Pytorch: 2.6.0 +- Datasets: 4.1.1 +- Tokenizers: 0.21.4 + +## Citations + +Cite GRPO as: + +```bibtex +@article{zhihong2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. 
Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} + +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..85dd418 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 0.0464448650211034, + "train_runtime": 1230.2242, + "train_samples": 100, + "train_samples_per_second": 0.081, + "train_steps_per_second": 0.081 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..05417b8 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + 
'<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..d97e3dc --- /dev/null +++ b/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": 4096, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.52.3", + "use_cache": false, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 
100644 index 0000000..6b5b266 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.52.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..8de13ab --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b659efa3f76bd7739659515e60d266bb5224dcb958f4bc9d5c4798fa2d712e2 +size 3554214752 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..e7cd2c1 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a +size 11422959 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..d252dd4 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "<think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "</think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "<tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": 
"</tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..85dd418 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 0.0464448650211034, + "train_runtime": 1230.2242, + "train_samples": 100, + "train_samples_per_second": 0.081, + "train_steps_per_second": 0.081 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..9a165cd --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,3043 @@ +{ + "best_global_step": null, + "best_metric": null, + 
"best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1627.0, + "completions/mean_length": 1840.125, + "completions/mean_terminated_length": 1493.666748046875, + "completions/min_length": 1381.0, + "completions/min_terminated_length": 1381.0, + "epoch": 0.01, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6628252342045541, + "kl": 0.0004730224609375, + "learning_rate": 0.0, + "loss": 0.0828, + "num_tokens": 15585.0, + "reward": 0.53125, + "reward_std": 0.41052013635635376, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.40625, + "rewards/tag_count_reward/std": 0.12938730418682098, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1674.0, + "completions/mean_length": 1917.875, + "completions/mean_terminated_length": 1527.5, + "completions/min_length": 1381.0, + "completions/min_terminated_length": 1381.0, + "epoch": 0.02, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7317782492587136, + "kl": 0.00067138671875, + "learning_rate": 1e-07, + "loss": 0.069, + "num_tokens": 32088.0, + "reward": 0.375, + "reward_std": 0.13363061845302582, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + 
"rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.375, + "rewards/tag_count_reward/std": 0.13363061845302582, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1641.0, + "completions/mean_length": 1356.875, + "completions/mean_terminated_length": 1126.5, + "completions/min_length": 625.0, + "completions/min_terminated_length": 625.0, + "epoch": 0.03, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9091411567234831, + "kl": 0.0005397796630859375, + "learning_rate": 2e-07, + "loss": 0.039, + "num_tokens": 44431.0, + "reward": 0.46875, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.46875, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7902192806756456, + "kl": 0.000720977783203125, + "learning_rate": 3e-07, + "loss": 0.0, + "num_tokens": 62215.0, + "reward": 0.28125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 
0.0883883461356163, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1580.0, + "completions/mean_length": 1989.5, + "completions/mean_terminated_length": 1580.0, + "completions/min_length": 1580.0, + "completions/min_terminated_length": 1580.0, + "epoch": 0.05, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4606638644327677, + "kl": 0.00045490264892578125, + "learning_rate": 4e-07, + "loss": 0.0382, + "num_tokens": 79059.0, + "reward": 0.5, + "reward_std": 0.4225771427154541, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.375, + "rewards/tag_count_reward/std": 0.13363061845302582, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.06, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0036145388208231963, + "kl": 0.0007953643798828125, + "learning_rate": 5e-07, + "loss": 0.0, + "num_tokens": 96667.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1741.0, + "completions/mean_length": 1504.875, + "completions/mean_terminated_length": 1427.2857666015625, + "completions/min_length": 1116.0, + "completions/min_terminated_length": 1116.0, + "epoch": 0.07, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003359971830136702, + "kl": 0.000370025634765625, + "learning_rate": 6e-07, + "loss": 0.0, + "num_tokens": 110562.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5, + "rewards/tag_count_reward/std": 0.0, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 833.625, + "completions/mean_terminated_length": 833.625, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "epoch": 0.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8406033348743903, + "kl": 0.000423431396484375, + "learning_rate": 7e-07, + "loss": 0.0395, + "num_tokens": 118143.0, + "reward": 0.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5625, + "rewards/tag_count_reward/std": 0.1767766922712326, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 
2048.0, + "completions/max_terminated_length": 1883.0, + "completions/mean_length": 1935.375, + "completions/mean_terminated_length": 1597.5, + "completions/min_length": 1312.0, + "completions/min_terminated_length": 1312.0, + "epoch": 0.09, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6507369858144503, + "kl": 0.0004940032958984375, + "learning_rate": 8e-07, + "loss": 0.0704, + "num_tokens": 134458.0, + "reward": 0.3125, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3125, + "rewards/tag_count_reward/std": 0.1157275140285492, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1694.0, + "completions/mean_length": 1947.0, + "completions/mean_terminated_length": 1644.0, + "completions/min_length": 1594.0, + "completions/min_terminated_length": 1594.0, + "epoch": 0.1, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9403904262794942, + "kl": 0.000942230224609375, + "learning_rate": 9e-07, + "loss": 0.0548, + "num_tokens": 151226.0, + "reward": 0.375, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.375, + "rewards/tag_count_reward/std": 0.26726123690605164, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + 
"completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003568944942064051, + "kl": 0.0007343292236328125, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 168538.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 745.375, + "completions/mean_terminated_length": 745.375, + "completions/min_length": 604.0, + "completions/min_terminated_length": 604.0, + "epoch": 0.12, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006363067666064702, + "kl": 0.00060272216796875, + "learning_rate": 9.997258721585931e-07, + "loss": 0.0, + "num_tokens": 175093.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5, + "rewards/tag_count_reward/std": 0.0, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13, + "frac_reward_zero_std": 
1.0, + "grad_norm": 0.0029190106524072316, + "kl": 0.0005397796630859375, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0, + "num_tokens": 192629.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1842.0, + "completions/mean_length": 1707.875, + "completions/mean_terminated_length": 1594.5, + "completions/min_length": 1003.0, + "completions/min_terminated_length": 1003.0, + "epoch": 0.14, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6809709230610892, + "kl": 0.00041961669921875, + "learning_rate": 9.975348529157229e-07, + "loss": 0.121, + "num_tokens": 207108.0, + "reward": 0.46875, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.46875, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028157580161607284, + "kl": 0.00044918060302734375, + "learning_rate": 
9.956206309337066e-07, + "loss": 0.0, + "num_tokens": 224476.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1748.0, + "completions/mean_length": 1521.0, + "completions/mean_terminated_length": 1345.3333740234375, + "completions/min_length": 944.0, + "completions/min_terminated_length": 944.0, + "epoch": 0.16, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0032241372357074316, + "kl": 0.0003681182861328125, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0, + "num_tokens": 237716.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5, + "rewards/tag_count_reward/std": 0.0, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 2038.875, + "completions/mean_terminated_length": 1975.0, + "completions/min_length": 1975.0, + "completions/min_terminated_length": 1975.0, + "epoch": 0.17, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5238026110591891, + "kl": 0.00043487548828125, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0064, + "num_tokens": 254851.0, + "reward": 0.34375, + "reward_std": 0.2651650309562683, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.34375, + "rewards/tag_count_reward/std": 0.2651650309562683, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 1472.875, + "completions/mean_terminated_length": 1390.71435546875, + "completions/min_length": 948.0, + "completions/min_terminated_length": 948.0, + "epoch": 0.18, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6969142375215345, + "kl": 0.000553131103515625, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0285, + "num_tokens": 267802.0, + "reward": 0.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5625, + "rewards/tag_count_reward/std": 0.1767766922712326, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006170385130050486, + "kl": 0.00032806396484375, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0, + "num_tokens": 285002.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1876.0, + "completions/mean_length": 1872.875, + "completions/mean_terminated_length": 1697.75, + "completions/min_length": 1359.0, + "completions/min_terminated_length": 1359.0, + "epoch": 0.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5924705799517846, + "kl": 0.000545501708984375, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0592, + "num_tokens": 301433.0, + "reward": 0.78125, + "reward_std": 0.60411536693573, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.40625, + "rewards/tag_count_reward/std": 0.12938730418682098, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 1238.0, + "completions/max_terminated_length": 1238.0, + "completions/mean_length": 718.75, + "completions/mean_terminated_length": 718.75, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, + "epoch": 0.21, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.076208309541439, + "kl": 0.000446319580078125, + "learning_rate": 9.728616793536587e-07, + "loss": -0.0477, + "num_tokens": 308647.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.0, + 
"rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5, + "rewards/tag_count_reward/std": 0.0, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 1146.0, + "completions/max_terminated_length": 1146.0, + "completions/mean_length": 751.0, + "completions/mean_terminated_length": 751.0, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.22, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6075613594297673, + "kl": 0.0004673004150390625, + "learning_rate": 9.672327345550543e-07, + "loss": -0.2154, + "num_tokens": 315719.0, + "reward": 0.6875, + "reward_std": 0.5303300619125366, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5625, + "rewards/tag_count_reward/std": 0.1767766922712326, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1330.0, + "completions/mean_length": 1307.125, + "completions/mean_terminated_length": 862.6000366210938, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "epoch": 0.23, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8767085399237937, + "kl": 0.0005292892456054688, + "learning_rate": 9.610954559391704e-07, + "loss": 0.2683, + "num_tokens": 327448.0, + "reward": 0.96875, + "reward_std": 0.5737953186035156, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + 
"rewards/tag_count_reward/mean": 0.46875, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6467284427973272, + "kl": 0.0005121231079101562, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0, + "num_tokens": 344728.0, + "reward": 0.28125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1745.0, + "completions/mean_terminated_length": 1442.0, + "completions/min_length": 918.0, + "completions/min_terminated_length": 918.0, + "epoch": 0.25, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6364191810172123, + "kl": 0.00034999847412109375, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0907, + "num_tokens": 359472.0, + "reward": 0.375, + "reward_std": 0.13363061845302582, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.375, + "rewards/tag_count_reward/std": 
0.13363061845302582, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 1109.0, + "completions/max_terminated_length": 1109.0, + "completions/mean_length": 860.125, + "completions/mean_terminated_length": 860.125, + "completions/min_length": 560.0, + "completions/min_terminated_length": 560.0, + "epoch": 0.26, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9557565788184962, + "kl": 0.0004291534423828125, + "learning_rate": 9.397114317029974e-07, + "loss": -0.0065, + "num_tokens": 367057.0, + "reward": 0.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5, + "rewards/tag_count_reward/std": 0.0, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003114398061108756, + "kl": 0.0005168914794921875, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0, + "num_tokens": 384721.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1633.625, + "completions/mean_terminated_length": 1495.5, + "completions/min_length": 832.0, + "completions/min_terminated_length": 832.0, + "epoch": 0.28, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8883130168762549, + "kl": 0.000690460205078125, + "learning_rate": 9.230669076497687e-07, + "loss": 0.0, + "num_tokens": 398990.0, + "reward": 0.4375, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4375, + "rewards/tag_count_reward/std": 0.1157275140285492, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1646.0, + "completions/mean_length": 1305.875, + "completions/mean_terminated_length": 1058.5, + "completions/min_length": 766.0, + "completions/min_terminated_length": 766.0, + "epoch": 0.29, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8508879198267415, + "kl": 0.00045108795166015625, + "learning_rate": 9.140576474687263e-07, + "loss": 0.2107, + "num_tokens": 410989.0, + "reward": 0.4375, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4375, + "rewards/tag_count_reward/std": 0.1157275140285492, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 319.875, + "completions/mean_terminated_length": 319.875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.3, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009396867221837423, + "kl": 0.000469207763671875, + "learning_rate": 9.046048391230247e-07, + "loss": 0.0, + "num_tokens": 414372.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5, + "rewards/tag_count_reward/std": 0.0, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1928.0, + "completions/mean_length": 1646.625, + "completions/mean_terminated_length": 1512.8333740234375, + "completions/min_length": 1172.0, + "completions/min_terminated_length": 1172.0, + "epoch": 0.31, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.677636559478867, + "kl": 0.0004253387451171875, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0789, + "num_tokens": 429153.0, + "reward": 0.4375, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4375, + "rewards/tag_count_reward/std": 0.1157275140285492, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 
2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.32, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42518375588544877, + "kl": 0.000492095947265625, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0, + "num_tokens": 446505.0, + "reward": 0.28125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1485.0, + "completions/mean_length": 1977.625, + "completions/mean_terminated_length": 1485.0, + "completions/min_length": 1485.0, + "completions/min_terminated_length": 1485.0, + "epoch": 0.33, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7573312728335488, + "kl": 0.00067901611328125, + "learning_rate": 8.737029101523929e-07, + "loss": 0.0563, + "num_tokens": 463614.0, + "reward": 0.28125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + 
"completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.34, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031934478388464358, + "kl": 0.000537872314453125, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0, + "num_tokens": 481382.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6214744310551946, + "kl": 0.00048160552978515625, + "learning_rate": 8.511087728614862e-07, + "loss": 0.0, + "num_tokens": 498622.0, + "reward": 0.28125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 1532.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 792.625, + "completions/mean_terminated_length": 792.625, + "completions/min_length": 
586.0, + "completions/min_terminated_length": 586.0, + "epoch": 0.36, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8848488126874285, + "kl": 0.0005741119384765625, + "learning_rate": 8.392544243589427e-07, + "loss": -0.0344, + "num_tokens": 506091.0, + "reward": 0.9375, + "reward_std": 0.47715675830841064, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.22160132229328156, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0037357667583392553, + "kl": 0.000514984130859375, + "learning_rate": 8.270476638965461e-07, + "loss": 0.0, + "num_tokens": 523475.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1437.0, + "completions/mean_length": 1464.875, + "completions/mean_terminated_length": 1115.0, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "epoch": 0.38, + 
"frac_reward_zero_std": 0.0, + "grad_norm": 0.9255650045041091, + "kl": 0.0004863739013671875, + "learning_rate": 8.145033635316128e-07, + "loss": 0.242, + "num_tokens": 536562.0, + "reward": 0.4375, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4375, + "rewards/tag_count_reward/std": 0.1157275140285492, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 1548.25, + "completions/mean_terminated_length": 1476.857177734375, + "completions/min_length": 996.0, + "completions/min_terminated_length": 996.0, + "epoch": 0.39, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7119712141872488, + "kl": 0.0004978179931640625, + "learning_rate": 8.01636806561836e-07, + "loss": 0.0712, + "num_tokens": 550620.0, + "reward": 0.46875, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.46875, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0037004750751766843, 
+ "kl": 0.000919342041015625, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0, + "num_tokens": 568516.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1684.25, + "completions/mean_terminated_length": 1320.5, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "epoch": 0.41, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6765909685421715, + "kl": 0.0006198883056640625, + "learning_rate": 7.75e-07, + "loss": 0.2158, + "num_tokens": 582934.0, + "reward": 0.375, + "reward_std": 0.13363061845302582, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.375, + "rewards/tag_count_reward/std": 0.13363061845302582, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1233.0, + "completions/mean_length": 1946.125, + "completions/mean_terminated_length": 1233.0, + "completions/min_length": 1233.0, + "completions/min_terminated_length": 1233.0, + "epoch": 0.42, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6782057995689209, + "kl": 0.000667572021484375, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0878, + "num_tokens": 600151.0, 
+ "reward": 0.28125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.43, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003069913511462463, + "kl": 0.0005235671997070312, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0, + "num_tokens": 617599.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 2034.625, + "completions/mean_terminated_length": 1941.0, + "completions/min_length": 1941.0, + "completions/min_terminated_length": 1941.0, + "epoch": 0.44, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47430574211354576, + "kl": 0.00039577484130859375, + "learning_rate": 7.330314893841101e-07, + "loss": 0.0073, + "num_tokens": 634780.0, + "reward": 0.3125, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3125, + "rewards/tag_count_reward/std": 0.1157275140285492, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1902.0, + "completions/mean_length": 1884.75, + "completions/mean_terminated_length": 1830.3333740234375, + "completions/min_length": 1705.0, + "completions/min_terminated_length": 1705.0, + "epoch": 0.45, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7613443731007656, + "kl": 0.0006561279296875, + "learning_rate": 7.185729670371604e-07, + "loss": 0.0207, + "num_tokens": 652242.0, + "reward": 0.4375, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4375, + "rewards/tag_count_reward/std": 0.1157275140285492, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1234.0, + "completions/mean_length": 1147.75, + "completions/mean_terminated_length": 847.6666870117188, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "epoch": 0.46, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0778276077011943, + "kl": 0.00066375732421875, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0154, + "num_tokens": 662560.0, + "reward": 0.8125, + "reward_std": 0.5786375403404236, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 
0.5175492167472839, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4375, + "rewards/tag_count_reward/std": 0.1157275140285492, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 1327.0, + "completions/max_terminated_length": 1327.0, + "completions/mean_length": 736.25, + "completions/mean_terminated_length": 736.25, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, + "epoch": 0.47, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7259986577953768, + "kl": 0.00115966796875, + "learning_rate": 6.890576474687263e-07, + "loss": -0.0495, + "num_tokens": 669274.0, + "reward": 0.9375, + "reward_std": 0.4172614812850952, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.25877460837364197, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 1397.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 1060.125, + "completions/mean_terminated_length": 1060.125, + "completions/min_length": 685.0, + "completions/min_terminated_length": 685.0, + "epoch": 0.48, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6442195673577423, + "kl": 0.0004329681396484375, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0696, + "num_tokens": 678611.0, + "reward": 0.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 
0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5625, + "rewards/tag_count_reward/std": 0.1767766922712326, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006123254776481617, + "kl": 0.00051116943359375, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0, + "num_tokens": 696635.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1466.0, + "completions/mean_length": 1975.25, + "completions/mean_terminated_length": 1466.0, + "completions/min_length": 1466.0, + "completions/min_terminated_length": 1466.0, + "epoch": 0.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6336529820312526, + "kl": 0.0006256103515625, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0585, + "num_tokens": 713709.0, + "reward": 0.28125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.28125, + 
"rewards/tag_count_reward/std": 0.0883883461356163, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 1899.0, + "completions/mean_terminated_length": 856.0, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "epoch": 0.51, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49637328343233095, + "kl": 0.0006580352783203125, + "learning_rate": 6.281416799501187e-07, + "loss": 0.1451, + "num_tokens": 729901.0, + "reward": 0.40625, + "reward_std": 0.4419417381286621, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 2043.625, + "completions/mean_terminated_length": 2013.0, + "completions/min_length": 2013.0, + "completions/min_terminated_length": 2013.0, + "epoch": 0.52, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7195590756820243, + "kl": 0.00067138671875, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0031, + "num_tokens": 747090.0, + "reward": 0.28125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.0883883461356163, 
+ "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 901.0, + "completions/max_terminated_length": 901.0, + "completions/mean_length": 565.5, + "completions/mean_terminated_length": 565.5, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.53, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031718782447743454, + "kl": 0.0008029937744140625, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0, + "num_tokens": 752910.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5, + "rewards/tag_count_reward/std": 0.0, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 1898.375, + "completions/mean_terminated_length": 851.0, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "epoch": 0.54, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6988079348841579, + "kl": 0.0009441375732421875, + "learning_rate": 5.813904131848564e-07, + "loss": 0.1114, + "num_tokens": 769249.0, + "reward": 0.3125, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3125, + "rewards/tag_count_reward/std": 0.1157275140285492, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1934.0, + "completions/mean_length": 1715.875, + "completions/mean_terminated_length": 1383.75, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "epoch": 0.55, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7609580950694678, + "kl": 0.0007781982421875, + "learning_rate": 5.657047735161255e-07, + "loss": 0.1302, + "num_tokens": 783920.0, + "reward": 0.90625, + "reward_std": 0.6399986147880554, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.40625, + "rewards/tag_count_reward/std": 0.12938730418682098, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 829.625, + "completions/mean_terminated_length": 655.5714721679688, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.56, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1927365377038142, + "kl": 0.001590728759765625, + "learning_rate": 5.5e-07, + "loss": 0.0939, + "num_tokens": 791709.0, + "reward": 0.53125, + "reward_std": 0.2086307406425476, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.53125, + "rewards/tag_count_reward/std": 0.2086307406425476, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1331.0, + "completions/mean_length": 994.5, + "completions/mean_terminated_length": 844.0000610351562, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.57, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.182121142240305, + "kl": 0.0005893707275390625, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0, + "num_tokens": 801057.0, + "reward": 0.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.625, + "rewards/tag_count_reward/std": 0.2314550280570984, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 1057.0, + "completions/max_terminated_length": 1057.0, + "completions/mean_length": 834.25, + "completions/mean_terminated_length": 834.25, + "completions/min_length": 659.0, + "completions/min_terminated_length": 659.0, + "epoch": 0.58, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8842736658535645, + "kl": 0.0005855560302734375, + "learning_rate": 5.186095868151436e-07, + "loss": 0.0177, + "num_tokens": 809131.0, + "reward": 0.875, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5, + "rewards/tag_count_reward/std": 0.0, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, 
+ "completions/max_length": 1438.0, + "completions/max_terminated_length": 1438.0, + "completions/mean_length": 580.25, + "completions/mean_terminated_length": 580.25, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.59, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.947212959421752, + "kl": 0.000576019287109375, + "learning_rate": 5.02962191529556e-07, + "loss": 0.1389, + "num_tokens": 814829.0, + "reward": 0.9375, + "reward_std": 0.45806270837783813, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.1767766922712326, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1808.0, + "completions/mean_length": 1823.875, + "completions/mean_terminated_length": 1450.3333740234375, + "completions/min_length": 1205.0, + "completions/min_terminated_length": 1205.0, + "epoch": 0.6, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7296721993472337, + "kl": 0.0008335113525390625, + "learning_rate": 4.873721045679706e-07, + "loss": 0.1096, + "num_tokens": 831916.0, + "reward": 0.375, + "reward_std": 0.13363061845302582, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.375, + "rewards/tag_count_reward/std": 0.13363061845302582, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.5, + "completions/max_length": 2048.0, + 
"completions/max_terminated_length": 1916.0, + "completions/mean_length": 1570.0, + "completions/mean_terminated_length": 1410.666748046875, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "epoch": 0.61, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6173214568936407, + "kl": 0.000675201416015625, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0118, + "num_tokens": 845372.0, + "reward": 0.46875, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.46875, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1746.5, + "completions/mean_terminated_length": 1445.0, + "completions/min_length": 1093.0, + "completions/min_terminated_length": 1093.0, + "epoch": 0.62, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6996417102286492, + "kl": 0.0004863739013671875, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0543, + "num_tokens": 860464.0, + "reward": 0.375, + "reward_std": 0.13363061845302582, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.375, + "rewards/tag_count_reward/std": 0.13363061845302582, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + 
"completions/mean_length": 1849.75, + "completions/mean_terminated_length": 1651.5, + "completions/min_length": 1406.0, + "completions/min_terminated_length": 1406.0, + "epoch": 0.63, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5518151553173074, + "kl": 0.00035190582275390625, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.0645, + "num_tokens": 876246.0, + "reward": 0.4375, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4375, + "rewards/tag_count_reward/std": 0.1157275140285492, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1869.0, + "completions/mean_terminated_length": 1332.0, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "epoch": 0.64, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7947269366057148, + "kl": 0.00045108795166015625, + "learning_rate": 4.2596318988235037e-07, + "loss": 0.1176, + "num_tokens": 892662.0, + "reward": 0.375, + "reward_std": 0.13363061845302582, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.375, + "rewards/tag_count_reward/std": 0.13363061845302582, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 1797.0, + 
"completions/mean_terminated_length": 1378.666748046875, + "completions/min_length": 662.0, + "completions/min_terminated_length": 662.0, + "epoch": 0.65, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7873341627366034, + "kl": 0.001064300537109375, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.1647, + "num_tokens": 908198.0, + "reward": 0.34375, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.34375, + "rewards/tag_count_reward/std": 0.12938730418682098, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1401.0, + "completions/mean_length": 1967.125, + "completions/mean_terminated_length": 1401.0, + "completions/min_length": 1401.0, + "completions/min_terminated_length": 1401.0, + "epoch": 0.66, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6108272802669615, + "kl": 0.0005245208740234375, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0663, + "num_tokens": 924743.0, + "reward": 0.40625, + "reward_std": 0.4419417381286621, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 2007.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1388.0, + 
"completions/mean_terminated_length": 1388.0, + "completions/min_length": 1120.0, + "completions/min_terminated_length": 1120.0, + "epoch": 0.67, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6923989772553029, + "kl": 0.0004787445068359375, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.0752, + "num_tokens": 937455.0, + "reward": 0.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.625, + "rewards/tag_count_reward/std": 0.2314550280570984, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.68, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5804438368543029, + "kl": 0.0006103515625, + "learning_rate": 3.6696851061588994e-07, + "loss": 0.0, + "num_tokens": 957231.0, + "reward": 0.5625, + "reward_std": 0.5786375403404236, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3125, + "rewards/tag_count_reward/std": 0.1157275140285492, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 
2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.69, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.602172715608051, + "kl": 0.0004863739013671875, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0, + "num_tokens": 974775.0, + "reward": 0.28125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1173.0, + "completions/mean_length": 1111.75, + "completions/mean_terminated_length": 978.0000610351562, + "completions/min_length": 717.0, + "completions/min_terminated_length": 717.0, + "epoch": 0.7, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5800710876181367, + "kl": 0.0004787445068359375, + "learning_rate": 3.387377967463493e-07, + "loss": 0.0362, + "num_tokens": 985317.0, + "reward": 0.59375, + "reward_std": 0.376485139131546, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.46875, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 1826.75, + "completions/mean_terminated_length": 1605.5, + "completions/min_length": 892.0, + 
"completions/min_terminated_length": 892.0, + "epoch": 0.71, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8588425272525544, + "kl": 0.001007080078125, + "learning_rate": 3.250000000000001e-07, + "loss": 0.025, + "num_tokens": 1001051.0, + "reward": 0.5, + "reward_std": 0.4225771427154541, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.375, + "rewards/tag_count_reward/std": 0.13363061845302582, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1592.0, + "completions/mean_length": 1284.0, + "completions/mean_terminated_length": 1174.857177734375, + "completions/min_length": 953.0, + "completions/min_terminated_length": 953.0, + "epoch": 0.72, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013235371533423931, + "kl": 0.0008678436279296875, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0, + "num_tokens": 1012123.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5, + "rewards/tag_count_reward/std": 0.0, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1916.25, + "completions/mean_terminated_length": 1696.666748046875, + "completions/min_length": 1519.0, + "completions/min_terminated_length": 1519.0, + "epoch": 0.73, + 
"frac_reward_zero_std": 0.0, + "grad_norm": 0.5474968619673198, + "kl": 0.000751495361328125, + "learning_rate": 2.9836319343816397e-07, + "loss": 0.0, + "num_tokens": 1028421.0, + "reward": 0.65625, + "reward_std": 0.5334774851799011, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.40625, + "rewards/tag_count_reward/std": 0.12938730418682098, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 1883.0, + "completions/max_terminated_length": 1883.0, + "completions/mean_length": 1288.75, + "completions/mean_terminated_length": 1288.75, + "completions/min_length": 764.0, + "completions/min_terminated_length": 764.0, + "epoch": 0.74, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003456095198800055, + "kl": 0.0003814697265625, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0, + "num_tokens": 1039587.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5, + "rewards/tag_count_reward/std": 0.0, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1668.0, + "completions/mean_length": 1781.0, + "completions/mean_terminated_length": 1514.0, + "completions/min_length": 1256.0, + "completions/min_terminated_length": 1256.0, + "epoch": 0.75, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.548319750098783, + "kl": 
0.00042057037353515625, + "learning_rate": 2.729523361034538e-07, + "loss": 0.0938, + "num_tokens": 1054939.0, + "reward": 0.9375, + "reward_std": 0.6087164282798767, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4375, + "rewards/tag_count_reward/std": 0.1157275140285492, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.76, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028496725166757297, + "kl": 0.00051116943359375, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0, + "num_tokens": 1072235.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1943.0, + "completions/mean_length": 1953.375, + "completions/mean_terminated_length": 1669.5, + "completions/min_length": 1396.0, + "completions/min_terminated_length": 1396.0, + "epoch": 0.77, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.706431481153136, + "kl": 0.000545501708984375, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0582, + 
"num_tokens": 1089334.0, + "reward": 0.3125, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3125, + "rewards/tag_count_reward/std": 0.1157275140285492, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1759.0, + "completions/mean_length": 1863.25, + "completions/mean_terminated_length": 1555.3333740234375, + "completions/min_length": 1322.0, + "completions/min_terminated_length": 1322.0, + "epoch": 0.78, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7026848252787423, + "kl": 0.0008983612060546875, + "learning_rate": 2.374037332934512e-07, + "loss": 0.1001, + "num_tokens": 1105592.0, + "reward": 0.40625, + "reward_std": 0.2651650309562683, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.40625, + "rewards/tag_count_reward/std": 0.2651650309562683, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 1333.0, + "completions/max_terminated_length": 1333.0, + "completions/mean_length": 1021.625, + "completions/mean_terminated_length": 1021.625, + "completions/min_length": 679.0, + "completions/min_terminated_length": 679.0, + "epoch": 0.79, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004479904510744182, + "kl": 0.0004482269287109375, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.0, + "num_tokens": 1114469.0, + "reward": 0.5, + 
"reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5, + "rewards/tag_count_reward/std": 0.0, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012713748507087811, + "kl": 0.000774383544921875, + "learning_rate": 2.1558482853517253e-07, + "loss": 0.0, + "num_tokens": 1132061.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.81, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009237916379458392, + "kl": 0.00116729736328125, + "learning_rate": 2.0528000059645995e-07, + "loss": 0.0, + "num_tokens": 1149405.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + 
"rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.82, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.731282287844213, + "kl": 0.00102996826171875, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.0, + "num_tokens": 1167485.0, + "reward": 0.28125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1497.0, + "completions/mean_length": 1806.375, + "completions/mean_terminated_length": 1403.666748046875, + "completions/min_length": 1279.0, + "completions/min_terminated_length": 1279.0, + "epoch": 0.83, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6696391690257025, + "kl": 0.0007190704345703125, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.1358, + "num_tokens": 1182888.0, + "reward": 0.34375, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.34375, + 
"rewards/tag_count_reward/std": 0.12938730418682098, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1856.375, + "completions/mean_terminated_length": 1537.0, + "completions/min_length": 1284.0, + "completions/min_terminated_length": 1284.0, + "epoch": 0.84, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8520445078964867, + "kl": 0.00098419189453125, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.1106, + "num_tokens": 1198539.0, + "reward": 0.34375, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.34375, + "rewards/tag_count_reward/std": 0.12938730418682098, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1619.0, + "completions/mean_length": 1552.75, + "completions/mean_terminated_length": 1387.666748046875, + "completions/min_length": 1177.0, + "completions/min_terminated_length": 1177.0, + "epoch": 0.85, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6415780475709305, + "kl": 0.00045013427734375, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.1278, + "num_tokens": 1211897.0, + "reward": 0.4375, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4375, + "rewards/tag_count_reward/std": 
0.1157275140285492, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1785.0, + "completions/mean_length": 1301.75, + "completions/mean_terminated_length": 854.0, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.86, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2846817650924338, + "kl": 0.0011138916015625, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.3474, + "num_tokens": 1223335.0, + "reward": 0.4375, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4375, + "rewards/tag_count_reward/std": 0.1157275140285492, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 1210.625, + "completions/mean_terminated_length": 1091.0, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "epoch": 0.87, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7215403346916941, + "kl": 0.0008296966552734375, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0492, + "num_tokens": 1233980.0, + "reward": 0.53125, + "reward_std": 0.2086307406425476, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.53125, + "rewards/tag_count_reward/std": 0.2086307406425476, + "step": 87 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 1143.375, + "completions/mean_terminated_length": 841.8333740234375, + "completions/min_length": 740.0, + "completions/min_terminated_length": 740.0, + "epoch": 0.88, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.888295640678355, + "kl": 0.0009326934814453125, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.203, + "num_tokens": 1244423.0, + "reward": 0.5625, + "reward_std": 0.2912411689758301, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5625, + "rewards/tag_count_reward/std": 0.29124119877815247, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 1903.0, + "completions/max_terminated_length": 1903.0, + "completions/mean_length": 1234.0, + "completions/mean_terminated_length": 1234.0, + "completions/min_length": 955.0, + "completions/min_terminated_length": 955.0, + "epoch": 0.89, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4757099764606759, + "kl": 0.000911712646484375, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.0644, + "num_tokens": 1255343.0, + "reward": 0.53125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.53125, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 
0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1968.0, + "completions/mean_length": 2009.625, + "completions/mean_terminated_length": 1894.5, + "completions/min_length": 1821.0, + "completions/min_terminated_length": 1821.0, + "epoch": 0.9, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7840957699122901, + "kl": 0.00072479248046875, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0212, + "num_tokens": 1272652.0, + "reward": 0.3125, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3125, + "rewards/tag_count_reward/std": 0.1157275140285492, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1859.0, + "completions/mean_length": 1858.25, + "completions/mean_terminated_length": 1542.0, + "completions/min_length": 1311.0, + "completions/min_terminated_length": 1311.0, + "epoch": 0.91, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8165269333115026, + "kl": 0.00063323974609375, + "learning_rate": 1.2713832064634125e-07, + "loss": 0.0456, + "num_tokens": 1289190.0, + "reward": 0.46875, + "reward_std": 0.4317220449447632, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.34375, + "rewards/tag_count_reward/std": 0.12938730418682098, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.92, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6822113621589488, + "kl": 0.000732421875, + "learning_rate": 1.220245676671809e-07, + "loss": 0.0, + "num_tokens": 1307070.0, + "reward": 0.28125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1638.0, + "completions/mean_length": 1996.75, + "completions/mean_terminated_length": 1638.0, + "completions/min_length": 1638.0, + "completions/min_terminated_length": 1638.0, + "epoch": 0.93, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5735308409410357, + "kl": 0.000598907470703125, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.0, + "num_tokens": 1324564.0, + "reward": 0.34375, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.34375, + "rewards/tag_count_reward/std": 0.12938730418682098, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.94, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0035277658572437772, + "kl": 0.00074005126953125, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.0, + "num_tokens": 1342244.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.95, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033309614626830077, + "kl": 0.0007114410400390625, + "learning_rate": 1.0983357966978745e-07, + "loss": 0.0, + "num_tokens": 1360532.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + 
"completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.96, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009616790202442917, + "kl": 0.0008449554443359375, + "learning_rate": 1.068365111445064e-07, + "loss": 0.0, + "num_tokens": 1377964.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1899.0, + "completions/mean_length": 1599.625, + "completions/mean_terminated_length": 1450.166748046875, + "completions/min_length": 1096.0, + "completions/min_terminated_length": 1096.0, + "epoch": 0.97, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6530906969360044, + "kl": 0.0005645751953125, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0652, + "num_tokens": 1391689.0, + "reward": 0.84375, + "reward_std": 0.5499594211578369, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.46875, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + 
"completions/min_terminated_length": 0.0, + "epoch": 0.98, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01164341791205279, + "kl": 0.0011444091796875, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.0, + "num_tokens": 1410297.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25, + "rewards/tag_count_reward/std": 0.0, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.99, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6801784484817373, + "kl": 0.0009212493896484375, + "learning_rate": 1.0109617738307911e-07, + "loss": 0.0, + "num_tokens": 1427513.0, + "reward": 0.40625, + "reward_std": 0.4419417381286621, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.0883883461356163, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1748.625, + "completions/mean_terminated_length": 1449.25, + "completions/min_length": 915.0, + "completions/min_terminated_length": 915.0, + "epoch": 1.0, + "frac_reward_zero_std": 0.0, + 
"grad_norm": 0.7939072364235943, + "kl": 0.0008449554443359375, + "learning_rate": 1.002741278414069e-07, + "loss": 0.1071, + "num_tokens": 1442638.0, + "reward": 0.40625, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.40625, + "rewards/tag_count_reward/std": 0.12938730418682098, + "step": 100 + }, + { + "epoch": 1.0, + "step": 100, + "total_flos": 0.0, + "train_loss": 0.0464448650211034, + "train_runtime": 1230.2242, + "train_samples_per_second": 0.081, + "train_steps_per_second": 0.081 + } + ], + "logging_steps": 1, + "max_steps": 100, + "num_input_tokens_seen": 1442638, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..ff11a36 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41d1c79764ffa0817f9c006d98791811df0096b34338a21077119f8f40bbe328 +size 10680