commit d2853dd1bb9d204469bcdd332ceb82af8544f8d0 Author: ModelHub XC Date: Thu Apr 30 05:08:48 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: mimoidochi/OpenRS-GRPO-S Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 
0000000..44687fd --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +datasets: knoveleng/open-rs +library_name: transformers +model_name: OpenRS-GRPO-S +tags: +- generated_from_trainer +- open-r1 +- trl +- grpo +licence: license +--- + +# Model Card for OpenRS-GRPO-S + +This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on the [knoveleng/open-rs](https://huggingface.co/datasets/knoveleng/open-rs) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="mimoidochi/OpenRS-GRPO-S", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/vrshy-stanford/huggingface/runs/rr2rdoct) + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + +### Framework versions + +- TRL: 0.16.0.dev0 +- Transformers: 4.49.0 +- Pytorch: 2.5.1 +- Datasets: 4.5.0 +- Tokenizers: 0.21.4 + +## Citations + +Cite GRPO as: + +```bibtex +@article{zhihong2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. 
Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} + +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..2425ff9 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": -0.17849066271579692, + "train_runtime": 2375.2815, + "train_samples": 7000, + "train_samples_per_second": 3.536, + "train_steps_per_second": 0.147 +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..7052064 --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": 4096, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.49.0", + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..01dfe4b --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + 
"bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.49.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..8595e68 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c8ab02c143f78cbc13457f196c7e7e2f4e4d117c212d598acc22791113a7aac +size 3554214752 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..e7cd2c1 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a +size 11422959 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ef6e98c --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,195 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "<think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "</think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "<tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "</tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + 
"content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + 
'```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..2425ff9 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": -0.17849066271579692, + "train_runtime": 2375.2815, + "train_samples": 7000, + "train_samples_per_second": 3.536, + "train_steps_per_second": 0.147 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..46ca579 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,4942 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2, + "eval_steps": 500, + 
"global_step": 350, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 3134.95849609375, + "epoch": 0.0005714285714285715, + "grad_norm": 0.6937011480331421, + "kl": 0.0, + "learning_rate": 2e-08, + "loss": -0.7208, + "reward": 0.27500003203749657, + "reward_std": 0.13869691640138626, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.2916666679084301, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 2868.9583740234375, + "epoch": 0.001142857142857143, + "grad_norm": 1.1340324878692627, + "kl": 0.0, + "learning_rate": 4e-08, + "loss": -0.7386, + "reward": 0.27500003576278687, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.4583333432674408, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 3083.70849609375, + "epoch": 0.0017142857142857142, + "grad_norm": 1.927505373954773, + "kl": 1.7076730728149414e-05, + "learning_rate": 6e-08, + "loss": -2.292, + "reward": 0.3500000163912773, + "reward_std": 0.40609321743249893, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.3333333358168602, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 2651.791748046875, + "epoch": 0.002285714285714286, + "grad_norm": 2.0249767303466797, + "kl": 3.701448440551758e-05, + "learning_rate": 8e-08, + "loss": -2.4107, + "reward": 0.40000002086162567, + "reward_std": 0.27739381790161133, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.5833333432674408, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 2761.916748046875, + "epoch": 0.002857142857142857, + "grad_norm": 2.0101120471954346, + "kl": 3.8623809814453125e-05, + "learning_rate": 1e-07, + "loss": -0.725, + "reward": 0.17500000819563866, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 
0.2916666679084301, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 3023.8333740234375, + "epoch": 0.0034285714285714284, + "grad_norm": 0.3757542073726654, + "kl": 3.993511199951172e-05, + "learning_rate": 1.2e-07, + "loss": 0.0, + "reward": 0.15000000596046448, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.25, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 3147.75, + "epoch": 0.004, + "grad_norm": 1.2157344818115234, + "kl": 2.2083520889282227e-05, + "learning_rate": 1.4e-07, + "loss": -1.6438, + "reward": 0.15000000596046448, + "reward_std": 0.19993415474891663, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.2083333432674408, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 3028.1251220703125, + "epoch": 0.004571428571428572, + "grad_norm": 0.7169070243835449, + "kl": 3.1948089599609375e-05, + "learning_rate": 1.6e-07, + "loss": -0.9882, + "reward": 0.2750000059604645, + "reward_std": 0.14747881889343262, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.375, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 2304.666748046875, + "epoch": 0.005142857142857143, + "grad_norm": 0.383722186088562, + "kl": 3.325939178466797e-05, + "learning_rate": 1.8e-07, + "loss": 0.0, + "reward": 0.4500000327825546, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/format_reward": 0.5, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 3193.625, + "epoch": 0.005714285714285714, + "grad_norm": 1.468209147453308, + "kl": 3.314018249511719e-05, + "learning_rate": 2e-07, + "loss": -1.6361, + "reward": 0.22500000894069672, + "reward_std": 0.22493848204612732, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.25, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 2689.916748046875, + "epoch": 0.006285714285714286, + "grad_norm": 
1.3963018655776978, + "kl": 5.7578086853027344e-05, + "learning_rate": 2.1999999999999998e-07, + "loss": -0.8156, + "reward": 0.2500000223517418, + "reward_std": 0.1549193412065506, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.3333333358168602, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 2627.541748046875, + "epoch": 0.006857142857142857, + "grad_norm": 1.4502061605453491, + "kl": 3.600120544433594e-05, + "learning_rate": 2.4e-07, + "loss": -2.2199, + "reward": 0.5500000715255737, + "reward_std": 0.3433297872543335, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/format_reward": 0.6250000298023224, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 2780.58349609375, + "epoch": 0.0074285714285714285, + "grad_norm": 0.4417027235031128, + "kl": 1.5676021575927734e-05, + "learning_rate": 2.6e-07, + "loss": 0.0, + "reward": 0.45000001788139343, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.5, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 2180.0, + "epoch": 0.008, + "grad_norm": 1.337963342666626, + "kl": 4.553794860839844e-05, + "learning_rate": 2.8e-07, + "loss": -1.6775, + "reward": 0.7000000476837158, + "reward_std": 0.19993416219949722, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/format_reward": 0.7083333730697632, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 2745.791748046875, + "epoch": 0.008571428571428572, + "grad_norm": 2.005941390991211, + "kl": 3.337860107421875e-05, + "learning_rate": 3e-07, + "loss": -1.7196, + "reward": 0.2500000149011612, + "reward_std": 0.1549193412065506, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.4166666716337204, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 2953.541748046875, + "epoch": 0.009142857142857144, + "grad_norm": 1.1998287439346313, + "kl": 4.00543212890625e-05, + "learning_rate": 3.2e-07, + "loss": -1.4557, + 
"reward": 0.42500001192092896, + "reward_std": 0.18371173739433289, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.5000000298023224, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 3183.916748046875, + "epoch": 0.009714285714285713, + "grad_norm": 3.6435375213623047, + "kl": 5.614757537841797e-05, + "learning_rate": 3.4000000000000003e-07, + "loss": -2.4555, + "reward": 0.17500000447034836, + "reward_std": 0.28209254145622253, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.2500000111758709, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 3134.75, + "epoch": 0.010285714285714285, + "grad_norm": 0.2575957179069519, + "kl": 5.364418029785156e-05, + "learning_rate": 3.6e-07, + "loss": 0.0, + "reward": 0.15000000596046448, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.25, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 2953.791748046875, + "epoch": 0.010857142857142857, + "grad_norm": 1.4193103313446045, + "kl": 5.936622619628906e-05, + "learning_rate": 3.7999999999999996e-07, + "loss": -0.7348, + "reward": 0.30000003799796104, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.2916666679084301, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 1856.7083740234375, + "epoch": 0.011428571428571429, + "grad_norm": 0.3330537974834442, + "kl": 2.3543834686279297e-05, + "learning_rate": 4e-07, + "loss": 0.0, + "reward": 0.5500000417232513, + "reward_std": 0.0774596706032753, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.75, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 1932.2501220703125, + "epoch": 0.012, + "grad_norm": 2.0459184646606445, + "kl": 3.30805778503418e-05, + "learning_rate": 4.1999999999999995e-07, + "loss": -1.7473, + "reward": 0.5250000357627869, + "reward_std": 
0.24647516012191772, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.75, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 3040.416748046875, + "epoch": 0.012571428571428572, + "grad_norm": 2.663806676864624, + "kl": 4.1604042053222656e-05, + "learning_rate": 4.3999999999999997e-07, + "loss": -3.5218, + "reward": 0.6000000238418579, + "reward_std": 0.5641850829124451, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/format_reward": 0.5416666865348816, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 2432.791748046875, + "epoch": 0.013142857142857144, + "grad_norm": 1.2767139673233032, + "kl": 4.3511390686035156e-05, + "learning_rate": 4.6e-07, + "loss": -0.9799, + "reward": 0.4500000327825546, + "reward_std": 0.20871607959270477, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.625, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 2564.2083740234375, + "epoch": 0.013714285714285714, + "grad_norm": 0.6577972173690796, + "kl": 4.863739013671875e-05, + "learning_rate": 4.8e-07, + "loss": -0.7097, + "reward": 0.5250000357627869, + "reward_std": 0.13869691640138626, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/format_reward": 0.5416666865348816, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 1678.4167175292969, + "epoch": 0.014285714285714285, + "grad_norm": 0.5228314995765686, + "kl": 4.172325134277344e-05, + "learning_rate": 5e-07, + "loss": -0.4805, + "reward": 0.6250000298023224, + "reward_std": 0.18371173739433289, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/format_reward": 0.7916666865348816, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 2865.6251220703125, + "epoch": 0.014857142857142857, + "grad_norm": 1.137710690498352, + "kl": 5.5909156799316406e-05, + "learning_rate": 5.2e-07, + "loss": -0.9844, + "reward": 0.3750000149011612, + "reward_std": 0.24647516012191772, + 
"rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.375, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 3516.375, + "epoch": 0.015428571428571429, + "grad_norm": 2.815162420272827, + "kl": 7.021427154541016e-05, + "learning_rate": 5.4e-07, + "loss": -2.2754, + "reward": 0.1250000111758709, + "reward_std": 0.2479735016822815, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.1666666679084301, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 2613.416748046875, + "epoch": 0.016, + "grad_norm": 1.428934931755066, + "kl": 7.724761962890625e-05, + "learning_rate": 5.6e-07, + "loss": -1.9051, + "reward": 0.42500002682209015, + "reward_std": 0.323934830725193, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.5, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 2780.9583740234375, + "epoch": 0.01657142857142857, + "grad_norm": 1.547488808631897, + "kl": 7.271766662597656e-05, + "learning_rate": 5.8e-07, + "loss": -1.3492, + "reward": 0.30000003427267075, + "reward_std": 0.32240864634513855, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.3333333544433117, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 3566.375, + "epoch": 0.017142857142857144, + "grad_norm": 1.6621514558792114, + "kl": 0.000118255615234375, + "learning_rate": 6e-07, + "loss": -1.2378, + "reward": 0.15000000596046448, + "reward_std": 0.27739381790161133, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.125, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 2331.5001220703125, + "epoch": 0.017714285714285714, + "grad_norm": 1.5594860315322876, + "kl": 0.00010943412780761719, + "learning_rate": 6.2e-07, + "loss": -1.4703, + "reward": 0.40000002086162567, + "reward_std": 0.2773938253521919, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.458333358168602, + "step": 31 + }, + { + 
"clip_ratio": 0.0, + "completion_length": 2168.8751220703125, + "epoch": 0.018285714285714287, + "grad_norm": 0.7033916711807251, + "kl": 0.00013780593872070312, + "learning_rate": 6.4e-07, + "loss": -0.678, + "reward": 0.6250000298023224, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.5416666865348816, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 3584.0, + "epoch": 0.018857142857142857, + "grad_norm": 0.45104363560676575, + "kl": 0.0001506805419921875, + "learning_rate": 6.6e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 2776.4583740234375, + "epoch": 0.019428571428571427, + "grad_norm": 0.6990403532981873, + "kl": 0.00020122528076171875, + "learning_rate": 6.800000000000001e-07, + "loss": -0.9026, + "reward": 0.5500000044703484, + "reward_std": 0.1549193412065506, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.5833333358168602, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 3161.8333740234375, + "epoch": 0.02, + "grad_norm": 2.173389434814453, + "kl": 0.0002646446228027344, + "learning_rate": 7e-07, + "loss": -1.6205, + "reward": 0.17500000447034836, + "reward_std": 0.13869691640138626, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.291666679084301, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 2875.33349609375, + "epoch": 0.02057142857142857, + "grad_norm": 2.574173927307129, + "kl": 0.0003509521484375, + "learning_rate": 7.2e-07, + "loss": -2.5022, + "reward": 0.30000001192092896, + "reward_std": 0.36425092816352844, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.3333333432674408, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 3408.8333740234375, + "epoch": 0.021142857142857144, + "grad_norm": 1.749144434928894, + 
"kl": 0.000339508056640625, + "learning_rate": 7.4e-07, + "loss": -2.3526, + "reward": 0.2500000149011612, + "reward_std": 0.40926575660705566, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.2083333358168602, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 2987.5001220703125, + "epoch": 0.021714285714285714, + "grad_norm": 1.8244866132736206, + "kl": 0.000728607177734375, + "learning_rate": 7.599999999999999e-07, + "loss": -1.8345, + "reward": 0.30000001192092896, + "reward_std": 0.2683281749486923, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.3333333432674408, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 1947.7083740234375, + "epoch": 0.022285714285714287, + "grad_norm": 1.6700297594070435, + "kl": 0.000614166259765625, + "learning_rate": 7.799999999999999e-07, + "loss": -2.2225, + "reward": 0.8000000715255737, + "reward_std": 0.24494898319244385, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/format_reward": 0.8750000298023224, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 3358.75, + "epoch": 0.022857142857142857, + "grad_norm": 0.8734477758407593, + "kl": 0.000514984130859375, + "learning_rate": 8e-07, + "loss": -1.4618, + "reward": 0.10000000894069672, + "reward_std": 0.14339563250541687, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.1666666716337204, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 2962.7083740234375, + "epoch": 0.023428571428571427, + "grad_norm": 1.2087616920471191, + "kl": 0.00079345703125, + "learning_rate": 8.199999999999999e-07, + "loss": -0.7151, + "reward": 0.32500001788139343, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.2916666865348816, + "rewards/format_reward": 0.25, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 2246.0834350585938, + "epoch": 0.024, + "grad_norm": 0.39726927876472473, + "kl": 0.0006685256958007812, + 
"learning_rate": 8.399999999999999e-07, + "loss": 0.0001, + "reward": 0.3500000238418579, + "reward_std": 0.0774596706032753, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.5, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 3212.7501220703125, + "epoch": 0.02457142857142857, + "grad_norm": 1.559288740158081, + "kl": 0.000598907470703125, + "learning_rate": 8.599999999999999e-07, + "loss": -1.7896, + "reward": 0.20000001788139343, + "reward_std": 0.2323790118098259, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.2500000074505806, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 3197.5001220703125, + "epoch": 0.025142857142857144, + "grad_norm": 1.7089260816574097, + "kl": 0.0013065338134765625, + "learning_rate": 8.799999999999999e-07, + "loss": -2.2349, + "reward": 0.40000003576278687, + "reward_std": 0.36088940501213074, + "rewards/accuracy_reward": 0.2916666679084301, + "rewards/format_reward": 0.3750000149011612, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 3498.95849609375, + "epoch": 0.025714285714285714, + "grad_norm": 1.1292223930358887, + "kl": 0.0009365081787109375, + "learning_rate": 9e-07, + "loss": -1.8457, + "reward": 0.1250000074505806, + "reward_std": 0.1596180573105812, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.2083333358168602, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 2446.4583740234375, + "epoch": 0.026285714285714287, + "grad_norm": 0.8362674713134766, + "kl": 0.002166748046875, + "learning_rate": 9.2e-07, + "loss": -0.9237, + "reward": 0.5500000417232513, + "reward_std": 0.21162375062704086, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/format_reward": 0.6666666865348816, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 2658.0, + "epoch": 0.026857142857142857, + "grad_norm": 1.1083022356033325, + "kl": 0.001728057861328125, + "learning_rate": 
9.399999999999999e-07, + "loss": -1.7675, + "reward": 0.5250000059604645, + "reward_std": 0.3030136823654175, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.5416666865348816, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 1924.25, + "epoch": 0.027428571428571427, + "grad_norm": 1.7161272764205933, + "kl": 0.01171112060546875, + "learning_rate": 9.6e-07, + "loss": 0.0019, + "reward": 0.6000000089406967, + "reward_std": 0.22085529565811157, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.75, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 2205.7500610351562, + "epoch": 0.028, + "grad_norm": 1.093712329864502, + "kl": 0.003936767578125, + "learning_rate": 9.8e-07, + "loss": -0.6857, + "reward": 0.4000000022351742, + "reward_std": 0.14339563250541687, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.5416666679084301, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 3085.291748046875, + "epoch": 0.02857142857142857, + "grad_norm": 1.334791660308838, + "kl": 0.00530242919921875, + "learning_rate": 1e-06, + "loss": -2.762, + "reward": 0.2750000134110451, + "reward_std": 0.39242780208587646, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.291666679084301, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 2834.2083740234375, + "epoch": 0.029142857142857144, + "grad_norm": 0.6573655605316162, + "kl": 0.0026569366455078125, + "learning_rate": 9.999890338174275e-07, + "loss": 0.0004, + "reward": 0.27500003576278687, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.25, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 3296.791748046875, + "epoch": 0.029714285714285714, + "grad_norm": 1.4219310283660889, + "kl": 0.002716064453125, + "learning_rate": 9.999561358041868e-07, + "loss": -2.8224, + "reward": 0.40000003576278687, + "reward_std": 
0.38455653190612793, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/format_reward": 0.4166666716337204, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 2921.291748046875, + "epoch": 0.030285714285714287, + "grad_norm": 1.2512001991271973, + "kl": 0.0057525634765625, + "learning_rate": 9.999013075636804e-07, + "loss": 0.0009, + "reward": 0.25, + "reward_std": 0.0774596706032753, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.25, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 2937.875, + "epoch": 0.030857142857142857, + "grad_norm": 0.24358271062374115, + "kl": 0.00394439697265625, + "learning_rate": 9.998245517681593e-07, + "loss": 0.0006, + "reward": 0.15000000596046448, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.25, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 3225.041748046875, + "epoch": 0.03142857142857143, + "grad_norm": 0.5058190226554871, + "kl": 0.0050811767578125, + "learning_rate": 9.997258721585931e-07, + "loss": -0.4553, + "reward": 0.30000001192092896, + "reward_std": 0.19993415474891663, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.2916666865348816, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 2791.33349609375, + "epoch": 0.032, + "grad_norm": 1.2000298500061035, + "kl": 0.0060272216796875, + "learning_rate": 9.996052735444862e-07, + "loss": -1.9483, + "reward": 0.30000003799796104, + "reward_std": 0.2173428237438202, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.4583333544433117, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 2855.5, + "epoch": 0.03257142857142857, + "grad_norm": 2.375877618789673, + "kl": 0.007659912109375, + "learning_rate": 9.994627618036452e-07, + "loss": -3.1453, + "reward": 0.40000002086162567, + "reward_std": 0.4469800293445587, + "rewards/accuracy_reward": 0.1666666679084301, + 
"rewards/format_reward": 0.5000000149011612, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 3482.75, + "epoch": 0.03314285714285714, + "grad_norm": 1.0685774087905884, + "kl": 0.005950927734375, + "learning_rate": 9.992983438818915e-07, + "loss": -1.8528, + "reward": 0.1250000111758709, + "reward_std": 0.2479735016822815, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.1666666679084301, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 2283.0834350585938, + "epoch": 0.03371428571428572, + "grad_norm": 1.0396032333374023, + "kl": 0.008758544921875, + "learning_rate": 9.991120277927223e-07, + "loss": -1.6818, + "reward": 0.42500001192092896, + "reward_std": 0.21615658700466156, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.6250000298023224, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 3297.45849609375, + "epoch": 0.03428571428571429, + "grad_norm": 1.2367874383926392, + "kl": 0.006866455078125, + "learning_rate": 9.989038226169207e-07, + "loss": -3.072, + "reward": 0.27500002086162567, + "reward_std": 0.3798578232526779, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.3750000149011612, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 2158.541748046875, + "epoch": 0.03485714285714286, + "grad_norm": 0.9891504049301147, + "kl": 0.0059051513671875, + "learning_rate": 9.98673738502114e-07, + "loss": -1.8628, + "reward": 0.7249999940395355, + "reward_std": 0.4160907417535782, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/format_reward": 0.7083333432674408, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 2971.291748046875, + "epoch": 0.03542857142857143, + "grad_norm": 0.7076906561851501, + "kl": 0.004241943359375, + "learning_rate": 9.98421786662277e-07, + "loss": -0.5466, + "reward": 0.17500000819563866, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0, + 
"rewards/format_reward": 0.2916666679084301, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 3438.25, + "epoch": 0.036, + "grad_norm": 0.8365570902824402, + "kl": 0.0075531005859375, + "learning_rate": 9.981479793771866e-07, + "loss": -1.6168, + "reward": 0.17500000447034836, + "reward_std": 0.26995331048965454, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.1666666679084301, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 3380.7083740234375, + "epoch": 0.036571428571428574, + "grad_norm": 0.727074384689331, + "kl": 0.00412750244140625, + "learning_rate": 9.97852329991824e-07, + "loss": -0.9846, + "reward": 0.07500000298023224, + "reward_std": 0.08215838670730591, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.125, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 3198.916748046875, + "epoch": 0.037142857142857144, + "grad_norm": 0.9209883809089661, + "kl": 0.0122222900390625, + "learning_rate": 9.975348529157229e-07, + "loss": -1.6992, + "reward": 0.22500000894069672, + "reward_std": 0.21632246673107147, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.2916666716337204, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 2440.08349609375, + "epoch": 0.037714285714285714, + "grad_norm": 1.0546424388885498, + "kl": 0.00701904296875, + "learning_rate": 9.971955636222684e-07, + "loss": -2.3799, + "reward": 0.550000011920929, + "reward_std": 0.3548535108566284, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.7500000298023224, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 2116.5, + "epoch": 0.038285714285714284, + "grad_norm": 1.3037738800048828, + "kl": 0.01519775390625, + "learning_rate": 9.968344786479415e-07, + "loss": -1.6571, + "reward": 0.45000001788139343, + "reward_std": 0.24978766590356827, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.625, 
+ "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 3452.666748046875, + "epoch": 0.038857142857142854, + "grad_norm": 0.39050254225730896, + "kl": 0.00258636474609375, + "learning_rate": 9.964516155915151e-07, + "loss": -0.6092, + "reward": 0.07500000298023224, + "reward_std": 0.12549901008605957, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0833333358168602, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 3062.0, + "epoch": 0.03942857142857143, + "grad_norm": 0.89232337474823, + "kl": 0.00434112548828125, + "learning_rate": 9.960469931131936e-07, + "loss": -1.2788, + "reward": 0.30000000447034836, + "reward_std": 0.2658701241016388, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.3333333544433117, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 2577.166748046875, + "epoch": 0.04, + "grad_norm": 1.3402986526489258, + "kl": 0.0091552734375, + "learning_rate": 9.956206309337066e-07, + "loss": -1.5284, + "reward": 0.42500000447034836, + "reward_std": 0.20463287830352783, + "rewards/accuracy_reward": 0.2916666679084301, + "rewards/format_reward": 0.4166666679084301, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 1986.875, + "epoch": 0.04057142857142857, + "grad_norm": 1.0082634687423706, + "kl": 0.004364013671875, + "learning_rate": 9.951725498333448e-07, + "loss": -1.6703, + "reward": 0.6500000059604645, + "reward_std": 0.19993415474891663, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/format_reward": 0.625, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 2902.125, + "epoch": 0.04114285714285714, + "grad_norm": 0.5259437561035156, + "kl": 0.008880615234375, + "learning_rate": 9.947027716509488e-07, + "loss": -0.8052, + "reward": 0.3500000163912773, + "reward_std": 0.0774596706032753, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.3333333358168602, + "step": 72 + }, + { + "clip_ratio": 0.0, 
+ "completion_length": 2200.7500610351562, + "epoch": 0.04171428571428572, + "grad_norm": 0.7980517745018005, + "kl": 0.01031494140625, + "learning_rate": 9.942113192828444e-07, + "loss": -0.9314, + "reward": 0.5, + "reward_std": 0.1549193412065506, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.6666666865348816, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 2764.2083740234375, + "epoch": 0.04228571428571429, + "grad_norm": 1.3613859415054321, + "kl": 0.00328826904296875, + "learning_rate": 9.93698216681727e-07, + "loss": -1.641, + "reward": 0.3500000163912773, + "reward_std": 0.28679126501083374, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.4166666679084301, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 2020.166748046875, + "epoch": 0.04285714285714286, + "grad_norm": 1.677713394165039, + "kl": 0.015411376953125, + "learning_rate": 9.931634888554935e-07, + "loss": -1.892, + "reward": 0.5500000715255737, + "reward_std": 0.29662763327360153, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.7916666865348816, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 3099.291748046875, + "epoch": 0.04342857142857143, + "grad_norm": 0.7256157398223877, + "kl": 0.0051727294921875, + "learning_rate": 9.926071618660237e-07, + "loss": -1.579, + "reward": 0.2750000059604645, + "reward_std": 0.22555401921272278, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.3333333432674408, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 3111.20849609375, + "epoch": 0.044, + "grad_norm": 0.5698821544647217, + "kl": 0.011016845703125, + "learning_rate": 9.9202926282791e-07, + "loss": 0.0018, + "reward": 0.42500004172325134, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.4583333432674408, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 
2611.541748046875, + "epoch": 0.044571428571428574, + "grad_norm": 1.2385872602462769, + "kl": 0.009613037109375, + "learning_rate": 9.91429819907136e-07, + "loss": -2.4486, + "reward": 0.5750000178813934, + "reward_std": 0.42487265169620514, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/format_reward": 0.5416666716337204, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 2208.3751220703125, + "epoch": 0.045142857142857144, + "grad_norm": 0.9444563388824463, + "kl": 0.011566162109375, + "learning_rate": 9.908088623197048e-07, + "loss": -2.1262, + "reward": 0.550000011920929, + "reward_std": 0.29662764072418213, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/format_reward": 0.6666666865348816, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 3514.791748046875, + "epoch": 0.045714285714285714, + "grad_norm": 0.6574345231056213, + "kl": 0.0064697265625, + "learning_rate": 9.901664203302124e-07, + "loss": -0.9746, + "reward": 0.07500000298023224, + "reward_std": 0.08215838670730591, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.125, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 2035.7501220703125, + "epoch": 0.046285714285714284, + "grad_norm": 1.1667371988296509, + "kl": 0.02191162109375, + "learning_rate": 9.895025252503755e-07, + "loss": -1.7981, + "reward": 0.6500000059604645, + "reward_std": 0.28908342123031616, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/format_reward": 0.8333333730697632, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 2879.2083740234375, + "epoch": 0.046857142857142854, + "grad_norm": 0.5121353268623352, + "kl": 0.009796142578125, + "learning_rate": 9.888172094375033e-07, + "loss": -0.8266, + "reward": 0.25, + "reward_std": 0.1549193412065506, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.3333333432674408, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 
1211.4166870117188, + "epoch": 0.04742857142857143, + "grad_norm": 0.9632378816604614, + "kl": 0.01214599609375, + "learning_rate": 9.881105062929221e-07, + "loss": -0.5763, + "reward": 0.7250000536441803, + "reward_std": 0.13869691640138626, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/format_reward": 0.7916666865348816, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 1922.916748046875, + "epoch": 0.048, + "grad_norm": 1.0117182731628418, + "kl": 0.009674072265625, + "learning_rate": 9.873824502603459e-07, + "loss": -0.7207, + "reward": 0.6000000536441803, + "reward_std": 0.27253396064043045, + "rewards/accuracy_reward": 0.291666679084301, + "rewards/format_reward": 0.7083333432674408, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 2529.875, + "epoch": 0.04857142857142857, + "grad_norm": 1.2297358512878418, + "kl": 0.023101806640625, + "learning_rate": 9.866330768241983e-07, + "loss": -1.8915, + "reward": 0.4000000059604645, + "reward_std": 0.22963720560073853, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.5416666865348816, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 2450.291748046875, + "epoch": 0.04914285714285714, + "grad_norm": 0.9087566137313843, + "kl": 0.014007568359375, + "learning_rate": 9.85862422507884e-07, + "loss": -1.8448, + "reward": 0.8500000536441803, + "reward_std": 0.464758038520813, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/format_reward": 0.8333333730697632, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 2870.5, + "epoch": 0.04971428571428571, + "grad_norm": 0.9272903800010681, + "kl": 0.01702880859375, + "learning_rate": 9.850705248720068e-07, + "loss": -0.4444, + "reward": 0.32500001415610313, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.2916666679084301, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 2377.8333740234375, + "epoch": 
0.05028571428571429, + "grad_norm": 0.9736530184745789, + "kl": 0.013763427734375, + "learning_rate": 9.8425742251254e-07, + "loss": -1.3592, + "reward": 0.45000001788139343, + "reward_std": 0.30921074748039246, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.6250000298023224, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 2418.791748046875, + "epoch": 0.05085714285714286, + "grad_norm": 0.9435445666313171, + "kl": 0.023223876953125, + "learning_rate": 9.83423155058946e-07, + "loss": -1.928, + "reward": 0.3500000238418579, + "reward_std": 0.26902148127555847, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.4583333432674408, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 2293.8751220703125, + "epoch": 0.05142857142857143, + "grad_norm": 1.0847179889678955, + "kl": 0.0203857421875, + "learning_rate": 9.825677631722435e-07, + "loss": -1.8569, + "reward": 0.8250000774860382, + "reward_std": 0.42503853142261505, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/format_reward": 0.8333333730697632, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 3049.3751220703125, + "epoch": 0.052, + "grad_norm": 0.5584085583686829, + "kl": 0.016754150390625, + "learning_rate": 9.816912885430258e-07, + "loss": -0.8024, + "reward": 0.40000002086162567, + "reward_std": 0.0774596706032753, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.4166666716337204, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 2802.0, + "epoch": 0.052571428571428575, + "grad_norm": 0.9023773670196533, + "kl": 0.0162353515625, + "learning_rate": 9.807937738894303e-07, + "loss": 0.0026, + "reward": 0.30000001192092896, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.25, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 2122.5, + "epoch": 0.053142857142857144, + "grad_norm": 1.0342284440994263, + "kl": 
0.016387939453125, + "learning_rate": 9.798752629550546e-07, + "loss": 0.0026, + "reward": 0.42500004172325134, + "reward_std": 0.13869690895080566, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.5, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 2639.2501220703125, + "epoch": 0.053714285714285714, + "grad_norm": 0.929490864276886, + "kl": 0.014007568359375, + "learning_rate": 9.78935800506826e-07, + "loss": -2.0397, + "reward": 0.5250000059604645, + "reward_std": 0.2479735016822815, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.7083333730697632, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 1750.0833740234375, + "epoch": 0.054285714285714284, + "grad_norm": 1.3795406818389893, + "kl": 0.01446533203125, + "learning_rate": 9.779754323328192e-07, + "loss": -2.9239, + "reward": 0.675000011920929, + "reward_std": 0.39452049136161804, + "rewards/accuracy_reward": 0.291666679084301, + "rewards/format_reward": 0.8333333730697632, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 3076.70849609375, + "epoch": 0.054857142857142854, + "grad_norm": 0.8867998123168945, + "kl": 0.01177978515625, + "learning_rate": 9.769942052400235e-07, + "loss": -1.3267, + "reward": 0.2500000223517418, + "reward_std": 0.24494898319244385, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.3333333544433117, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 2068.1250610351562, + "epoch": 0.05542857142857143, + "grad_norm": 1.0344054698944092, + "kl": 0.0184326171875, + "learning_rate": 9.759921670520634e-07, + "loss": -0.6738, + "reward": 0.3749999962747097, + "reward_std": 0.13869691640138626, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.5416666679084301, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 1832.9584350585938, + "epoch": 0.056, + "grad_norm": 0.7039546966552734, + "kl": 
0.0093536376953125, + "learning_rate": 9.749693666068663e-07, + "loss": -1.7297, + "reward": 0.7250000536441803, + "reward_std": 0.31179559230804443, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/format_reward": 0.75, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 2053.041748046875, + "epoch": 0.05657142857142857, + "grad_norm": 1.1954854726791382, + "kl": 0.008026123046875, + "learning_rate": 9.739258537542835e-07, + "loss": -1.1949, + "reward": 0.6500000357627869, + "reward_std": 0.3548534959554672, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/format_reward": 0.6250000298023224, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 1900.1250610351562, + "epoch": 0.05714285714285714, + "grad_norm": 0.692438542842865, + "kl": 0.008209228515625, + "learning_rate": 9.728616793536587e-07, + "loss": -0.8441, + "reward": 0.6750000715255737, + "reward_std": 0.26995329558849335, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/format_reward": 0.625, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 2909.0833740234375, + "epoch": 0.05771428571428571, + "grad_norm": 0.7268219590187073, + "kl": 0.0091552734375, + "learning_rate": 9.717768952713511e-07, + "loss": -1.6243, + "reward": 0.30000001192092896, + "reward_std": 0.2323790192604065, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.3333333432674408, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 2342.25, + "epoch": 0.05828571428571429, + "grad_norm": 1.311353087425232, + "kl": 0.0206298828125, + "learning_rate": 9.706715543782064e-07, + "loss": -2.3993, + "reward": 0.42500004172325134, + "reward_std": 0.3704479932785034, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.5, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 2729.7916870117188, + "epoch": 0.05885714285714286, + "grad_norm": 1.082575798034668, + "kl": 0.0152587890625, + 
"learning_rate": 9.695457105469804e-07, + "loss": -2.3919, + "reward": 0.2500000111758709, + "reward_std": 0.2861757278442383, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.2916666679084301, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 2986.5833740234375, + "epoch": 0.05942857142857143, + "grad_norm": 0.5590086579322815, + "kl": 0.011871337890625, + "learning_rate": 9.683994186497132e-07, + "loss": -0.5574, + "reward": 0.30000003427267075, + "reward_std": 0.19993416219949722, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/format_reward": 0.2916666679084301, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 2627.33349609375, + "epoch": 0.06, + "grad_norm": 0.7722799777984619, + "kl": 0.02337646484375, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0037, + "reward": 0.32500001788139343, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.5, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 2862.666748046875, + "epoch": 0.060571428571428575, + "grad_norm": 0.46922969818115234, + "kl": 0.009735107421875, + "learning_rate": 9.66045715125541e-07, + "loss": -0.7062, + "reward": 0.27500003576278687, + "reward_std": 0.20463287830352783, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.2916666865348816, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 2396.3750610351562, + "epoch": 0.061142857142857145, + "grad_norm": 0.8894286155700684, + "kl": 0.01702880859375, + "learning_rate": 9.648384182148252e-07, + "loss": -1.4031, + "reward": 0.3499999940395355, + "reward_std": 0.1741531491279602, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.5000000298023224, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 2107.0833740234375, + "epoch": 0.061714285714285715, + "grad_norm": 0.5590862035751343, + "kl": 0.0107421875, + 
"learning_rate": 9.636109026648554e-07, + "loss": -0.6467, + "reward": 0.32500000298023224, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.5416666865348816, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 3016.291748046875, + "epoch": 0.062285714285714285, + "grad_norm": 0.4933023750782013, + "kl": 0.0123291015625, + "learning_rate": 9.623632283030077e-07, + "loss": 0.002, + "reward": 0.17499999701976776, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.25, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 2778.0001220703125, + "epoch": 0.06285714285714286, + "grad_norm": 0.7055474519729614, + "kl": 0.024871826171875, + "learning_rate": 9.610954559391704e-07, + "loss": -0.7942, + "reward": 0.2500000223517418, + "reward_std": 0.1549193412065506, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.3333333358168602, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 2845.041748046875, + "epoch": 0.06342857142857143, + "grad_norm": 2.253298044204712, + "kl": 0.01910400390625, + "learning_rate": 9.598076473627796e-07, + "loss": 0.003, + "reward": 0.2250000238418579, + "reward_std": 0.08215838670730591, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.25, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 2531.916748046875, + "epoch": 0.064, + "grad_norm": 0.9246645569801331, + "kl": 0.015045166015625, + "learning_rate": 9.58499865339809e-07, + "loss": -2.463, + "reward": 0.3750000298023224, + "reward_std": 0.28209254145622253, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.5000000298023224, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 2310.4583740234375, + "epoch": 0.06457142857142857, + "grad_norm": 0.46783214807510376, + "kl": 0.01202392578125, + "learning_rate": 9.571721736097088e-07, + "loss": -0.7241, 
+ "reward": 0.3750000149011612, + "reward_std": 0.17702671885490417, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.4583333432674408, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 3185.666748046875, + "epoch": 0.06514285714285714, + "grad_norm": 0.66249018907547, + "kl": 0.01556396484375, + "learning_rate": 9.55824636882301e-07, + "loss": -0.9383, + "reward": 0.10000000894069672, + "reward_std": 0.0774596706032753, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.1666666716337204, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 2467.2501220703125, + "epoch": 0.06571428571428571, + "grad_norm": 0.9964971542358398, + "kl": 0.0130615234375, + "learning_rate": 9.54457320834625e-07, + "loss": -2.1875, + "reward": 0.32500001788139343, + "reward_std": 0.2611714005470276, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.4583333432674408, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 2428.5416870117188, + "epoch": 0.06628571428571428, + "grad_norm": 0.472149521112442, + "kl": 0.01055908203125, + "learning_rate": 9.530702921077358e-07, + "loss": 0.0017, + "reward": 0.32500001788139343, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.5, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 2962.041748046875, + "epoch": 0.06685714285714285, + "grad_norm": 1.044438362121582, + "kl": 0.0174560546875, + "learning_rate": 9.516636183034564e-07, + "loss": -2.2953, + "reward": 0.5999999940395355, + "reward_std": 0.44171059131622314, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/format_reward": 0.5000000298023224, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 1725.3333740234375, + "epoch": 0.06742857142857143, + "grad_norm": 0.6704514622688293, + "kl": 0.013427734375, + "learning_rate": 9.502373679810839e-07, + "loss": -0.7163, + "reward": 
0.6250000298023224, + "reward_std": 0.11291590332984924, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.7083333432674408, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 2067.8333740234375, + "epoch": 0.068, + "grad_norm": 0.798595666885376, + "kl": 0.014404296875, + "learning_rate": 9.487916106540465e-07, + "loss": 0.0023, + "reward": 0.6000000238418579, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.5, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 2834.75, + "epoch": 0.06857142857142857, + "grad_norm": 0.5755884051322937, + "kl": 0.01953125, + "learning_rate": 9.473264167865171e-07, + "loss": -0.8146, + "reward": 0.25, + "reward_std": 0.1549193412065506, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.3333333432674408, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 2840.291748046875, + "epoch": 0.06914285714285714, + "grad_norm": 0.5671223402023315, + "kl": 0.018829345703125, + "learning_rate": 9.458418577899774e-07, + "loss": -0.7081, + "reward": 0.2750000022351742, + "reward_std": 0.13869691640138626, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.2916666679084301, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 1911.1251220703125, + "epoch": 0.06971428571428571, + "grad_norm": 0.6135214567184448, + "kl": 0.014678955078125, + "learning_rate": 9.443380060197385e-07, + "loss": 0.0024, + "reward": 0.7750000357627869, + "reward_std": 0.13869690895080566, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/format_reward": 0.75, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 2615.7083740234375, + "epoch": 0.07028571428571428, + "grad_norm": 1.535902976989746, + "kl": 0.019317626953125, + "learning_rate": 9.428149347714143e-07, + "loss": -3.4492, + "reward": 0.5000000447034836, + "reward_std": 0.49277445673942566, + 
"rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.5000000298023224, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 2879.6251220703125, + "epoch": 0.07085714285714285, + "grad_norm": 1.1739275455474854, + "kl": 0.016326904296875, + "learning_rate": 9.412727182773486e-07, + "loss": -2.466, + "reward": 0.5, + "reward_std": 0.22085529565811157, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.5833333432674408, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 1907.7916870117188, + "epoch": 0.07142857142857142, + "grad_norm": 0.5758523344993591, + "kl": 0.016204833984375, + "learning_rate": 9.397114317029974e-07, + "loss": -0.8541, + "reward": 0.5750000476837158, + "reward_std": 0.19540132582187653, + "rewards/accuracy_reward": 0.291666679084301, + "rewards/format_reward": 0.6666666716337204, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 1292.2916870117188, + "epoch": 0.072, + "grad_norm": 0.6998382806777954, + "kl": 0.009063720703125, + "learning_rate": 9.381311511432658e-07, + "loss": -0.7275, + "reward": 0.7500000298023224, + "reward_std": 0.16431677341461182, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 2668.916748046875, + "epoch": 0.07257142857142856, + "grad_norm": 0.6192981004714966, + "kl": 0.0121612548828125, + "learning_rate": 9.36531953618799e-07, + "loss": -0.9637, + "reward": 0.3750000149011612, + "reward_std": 0.08215838670730591, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.375, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 2330.33349609375, + "epoch": 0.07314285714285715, + "grad_norm": 0.850170373916626, + "kl": 0.0172271728515625, + "learning_rate": 9.34913917072228e-07, + "loss": -1.662, + "reward": 0.5500000417232513, + "reward_std": 0.1741531491279602, + "rewards/accuracy_reward": 0.2916666865348816, + 
"rewards/format_reward": 0.6250000298023224, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 1855.9166870117188, + "epoch": 0.07371428571428572, + "grad_norm": 0.6127024292945862, + "kl": 0.013641357421875, + "learning_rate": 9.332771203643714e-07, + "loss": -0.7422, + "reward": 0.5500000417232513, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.7083333432674408, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 1596.4583740234375, + "epoch": 0.07428571428571429, + "grad_norm": 0.6304543614387512, + "kl": 0.01348876953125, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0022, + "reward": 0.6500000655651093, + "reward_std": 0.24177644401788712, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/format_reward": 0.75, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 2883.291748046875, + "epoch": 0.07485714285714286, + "grad_norm": 0.4680456817150116, + "kl": 0.01495361328125, + "learning_rate": 9.299475664759068e-07, + "loss": 0.0024, + "reward": 0.15000000596046448, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.25, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 2198.375, + "epoch": 0.07542857142857143, + "grad_norm": 0.583177924156189, + "kl": 0.013580322265625, + "learning_rate": 9.282549715730579e-07, + "loss": -1.4613, + "reward": 0.5750000476837158, + "reward_std": 0.15610557794570923, + "rewards/accuracy_reward": 0.2916666679084301, + "rewards/format_reward": 0.6666666865348816, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 2586.6666870117188, + "epoch": 0.076, + "grad_norm": 0.4416232705116272, + "kl": 0.0152587890625, + "learning_rate": 9.265439410565328e-07, + "loss": -0.7425, + "reward": 0.4750000238418579, + "reward_std": 0.20463287830352783, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.4583333432674408, + "step": 
133 + }, + { + "clip_ratio": 0.0, + "completion_length": 2783.4583740234375, + "epoch": 0.07657142857142857, + "grad_norm": 0.8242610096931458, + "kl": 0.01690673828125, + "learning_rate": 9.248145583195447e-07, + "loss": -1.5028, + "reward": 0.3500000163912773, + "reward_std": 0.24494898319244385, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.3750000223517418, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 2785.666748046875, + "epoch": 0.07714285714285714, + "grad_norm": 0.8124563097953796, + "kl": 0.02239990234375, + "learning_rate": 9.230669076497687e-07, + "loss": -0.6555, + "reward": 0.19999999925494194, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.2916666679084301, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 1172.4583740234375, + "epoch": 0.07771428571428571, + "grad_norm": 1.0723152160644531, + "kl": 0.018798828125, + "learning_rate": 9.213010742252327e-07, + "loss": -0.9192, + "reward": 0.675000011920929, + "reward_std": 0.2611714005470276, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/format_reward": 0.875, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 2302.041748046875, + "epoch": 0.07828571428571429, + "grad_norm": 0.9255122542381287, + "kl": 0.0070953369140625, + "learning_rate": 9.195171441101668e-07, + "loss": -2.5899, + "reward": 0.42500001192092896, + "reward_std": 0.4010545611381531, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/format_reward": 0.5, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 2176.9583740234375, + "epoch": 0.07885714285714286, + "grad_norm": 0.5417965650558472, + "kl": 0.016082763671875, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0026, + "reward": 0.45000001788139343, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.5, + "step": 138 + }, + { + "clip_ratio": 0.0, + 
"completion_length": 2479.8333740234375, + "epoch": 0.07942857142857143, + "grad_norm": 0.8476680517196655, + "kl": 0.019317626953125, + "learning_rate": 9.158953424711624e-07, + "loss": -1.9513, + "reward": 0.30000003799796104, + "reward_std": 0.2173428237438202, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.4583333544433117, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 1950.8334350585938, + "epoch": 0.08, + "grad_norm": 0.32817330956459045, + "kl": 0.0100860595703125, + "learning_rate": 9.140576474687263e-07, + "loss": -0.7432, + "reward": 0.5750000178813934, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.7083333432674408, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 1543.7500610351562, + "epoch": 0.08057142857142857, + "grad_norm": 0.5753952264785767, + "kl": 0.01080322265625, + "learning_rate": 9.122022088101613e-07, + "loss": 0.0017, + "reward": 0.4750000089406967, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.75, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 2146.2083740234375, + "epoch": 0.08114285714285714, + "grad_norm": 0.7607588171958923, + "kl": 0.01971435546875, + "learning_rate": 9.103291169269299e-07, + "loss": -1.4222, + "reward": 0.4750000238418579, + "reward_std": 0.280418336391449, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.5416666865348816, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 2653.3751220703125, + "epoch": 0.08171428571428571, + "grad_norm": 1.6966173648834229, + "kl": 0.0238037109375, + "learning_rate": 9.084384631108882e-07, + "loss": -1.9226, + "reward": 0.45000001788139343, + "reward_std": 0.2773938328027725, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/format_reward": 0.4166666865348816, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 
1880.166748046875, + "epoch": 0.08228571428571428, + "grad_norm": 1.1052340269088745, + "kl": 0.018768310546875, + "learning_rate": 9.065303395098358e-07, + "loss": -0.9172, + "reward": 0.7750000357627869, + "reward_std": 0.22493848204612732, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/format_reward": 0.875, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 886.25, + "epoch": 0.08285714285714285, + "grad_norm": 0.5123775601387024, + "kl": 0.016632080078125, + "learning_rate": 9.046048391230247e-07, + "loss": 0.0027, + "reward": 0.6500000059604645, + "reward_std": 0.0774596706032753, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 1.0, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 1539.8333435058594, + "epoch": 0.08342857142857144, + "grad_norm": 0.5727400779724121, + "kl": 0.01708984375, + "learning_rate": 9.026620557966279e-07, + "loss": 0.0027, + "reward": 0.6500000357627869, + "reward_std": 0.22085530310869217, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.75, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 1381.3750610351562, + "epoch": 0.084, + "grad_norm": 0.8086219429969788, + "kl": 0.01629638671875, + "learning_rate": 9.007020842191634e-07, + "loss": -1.4881, + "reward": 0.7750000357627869, + "reward_std": 0.31787581741809845, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/format_reward": 0.8750000298023224, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 1451.5, + "epoch": 0.08457142857142858, + "grad_norm": 0.8584139347076416, + "kl": 0.01898193359375, + "learning_rate": 8.987250199168808e-07, + "loss": -0.9353, + "reward": 0.550000011920929, + "reward_std": 0.1549193412065506, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.8333333432674408, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 1420.4583740234375, + "epoch": 0.08514285714285715, + 
"grad_norm": 0.8129308223724365, + "kl": 0.014434814453125, + "learning_rate": 8.967309592491052e-07, + "loss": -1.3771, + "reward": 0.6000000536441803, + "reward_std": 0.19993415474891663, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/format_reward": 0.75, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 2162.666748046875, + "epoch": 0.08571428571428572, + "grad_norm": 0.889790952205658, + "kl": 0.02252197265625, + "learning_rate": 8.9471999940354e-07, + "loss": -0.58, + "reward": 0.42500004172325134, + "reward_std": 0.20463287830352783, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.5416666865348816, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 2218.7916870117188, + "epoch": 0.08628571428571429, + "grad_norm": 0.5901851654052734, + "kl": 0.013671875, + "learning_rate": 8.926922383915315e-07, + "loss": -0.7892, + "reward": 0.5500000268220901, + "reward_std": 0.22085529565811157, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.5833333432674408, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 2912.916748046875, + "epoch": 0.08685714285714285, + "grad_norm": 0.5517586469650269, + "kl": 0.021484375, + "learning_rate": 8.906477750432903e-07, + "loss": -0.891, + "reward": 0.3250000327825546, + "reward_std": 0.1596180573105812, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.3333333432674408, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 2069.916748046875, + "epoch": 0.08742857142857142, + "grad_norm": 0.6771623492240906, + "kl": 0.015380859375, + "learning_rate": 8.88586709003076e-07, + "loss": -1.2495, + "reward": 0.4750000238418579, + "reward_std": 0.15610557794570923, + "rewards/accuracy_reward": 0.2916666679084301, + "rewards/format_reward": 0.5, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 2867.5833740234375, + "epoch": 0.088, + "grad_norm": 0.6926563382148743, 
+ "kl": 0.017333984375, + "learning_rate": 8.865091407243394e-07, + "loss": -1.5406, + "reward": 0.27500002086162567, + "reward_std": 0.2479735016822815, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.291666679084301, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 3541.0833740234375, + "epoch": 0.08857142857142856, + "grad_norm": 0.6008081436157227, + "kl": 0.01416015625, + "learning_rate": 8.844151714648274e-07, + "loss": -0.6494, + "reward": 0.07500000298023224, + "reward_std": 0.12549901008605957, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0416666679084301, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 1990.041748046875, + "epoch": 0.08914285714285715, + "grad_norm": 1.1144965887069702, + "kl": 0.01812744140625, + "learning_rate": 8.823049032816478e-07, + "loss": -2.2955, + "reward": 0.675000011920929, + "reward_std": 0.3254331648349762, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/format_reward": 0.7500000298023224, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 2252.8333740234375, + "epoch": 0.08971428571428572, + "grad_norm": 0.5644240379333496, + "kl": 0.012451171875, + "learning_rate": 8.801784390262943e-07, + "loss": 0.002, + "reward": 0.3750000298023224, + "reward_std": 0.13869691640138626, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.5, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 1755.791748046875, + "epoch": 0.09028571428571429, + "grad_norm": 0.8909981846809387, + "kl": 0.0125732421875, + "learning_rate": 8.780358823396352e-07, + "loss": -1.3986, + "reward": 0.7000000476837158, + "reward_std": 0.38667041063308716, + "rewards/accuracy_reward": 0.3750000223517418, + "rewards/format_reward": 0.7916666865348816, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 3373.541748046875, + "epoch": 0.09085714285714286, + "grad_norm": 0.568712055683136, 
+ "kl": 0.016937255859375, + "learning_rate": 8.758773376468604e-07, + "loss": -0.678, + "reward": 0.05000000447034836, + "reward_std": 0.0774596706032753, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0833333358168602, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 3461.0833740234375, + "epoch": 0.09142857142857143, + "grad_norm": 0.5345960855484009, + "kl": 0.01690673828125, + "learning_rate": 8.737029101523929e-07, + "loss": -1.3097, + "reward": 0.07500000298023224, + "reward_std": 0.18371173739433289, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0833333358168602, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 2223.291748046875, + "epoch": 0.092, + "grad_norm": 0.8153612017631531, + "kl": 0.0274658203125, + "learning_rate": 8.715127058347614e-07, + "loss": -0.9097, + "reward": 0.6000000089406967, + "reward_std": 0.32863354682922363, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.625, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 2307.875, + "epoch": 0.09257142857142857, + "grad_norm": 0.5529870986938477, + "kl": 0.013397216796875, + "learning_rate": 8.693068314414344e-07, + "loss": -1.2837, + "reward": 0.5000000447034836, + "reward_std": 0.2773938328027725, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/format_reward": 0.4583333432674408, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 2981.0, + "epoch": 0.09314285714285714, + "grad_norm": 0.8973040580749512, + "kl": 0.0225830078125, + "learning_rate": 8.670853944836176e-07, + "loss": -1.6263, + "reward": 0.20000001788139343, + "reward_std": 0.21162375807762146, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.25, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 2735.875, + "epoch": 0.09371428571428571, + "grad_norm": 1.581978678703308, + "kl": 0.016082763671875, + "learning_rate": 8.648485032310144e-07, + 
"loss": -2.4628, + "reward": 0.42500001192092896, + "reward_std": 0.35955221951007843, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.5833333432674408, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 3489.4583740234375, + "epoch": 0.09428571428571429, + "grad_norm": 1.0414494276046753, + "kl": 0.0177001953125, + "learning_rate": 8.625962667065487e-07, + "loss": -2.4063, + "reward": 0.15000000223517418, + "reward_std": 0.26419591903686523, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.2083333395421505, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 3584.0, + "epoch": 0.09485714285714286, + "grad_norm": 0.3711743652820587, + "kl": 0.015045166015625, + "learning_rate": 8.603287946810513e-07, + "loss": -0.6281, + "reward": 0.05000000447034836, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0416666679084301, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 1534.9584350585938, + "epoch": 0.09542857142857143, + "grad_norm": 0.7632970809936523, + "kl": 0.02166748046875, + "learning_rate": 8.580461976679099e-07, + "loss": -0.8458, + "reward": 0.8250000774860382, + "reward_std": 0.29361626505851746, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/format_reward": 0.9166666865348816, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 2715.8333740234375, + "epoch": 0.096, + "grad_norm": 0.774446427822113, + "kl": 0.02288818359375, + "learning_rate": 8.557485869176825e-07, + "loss": -1.9958, + "reward": 0.40000003576278687, + "reward_std": 0.29662763327360153, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.4583333432674408, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 1717.4166870117188, + "epoch": 0.09657142857142857, + "grad_norm": 0.6996132135391235, + "kl": 0.01715087890625, + "learning_rate": 
8.534360744126753e-07, + "loss": -1.6658, + "reward": 0.5500000268220901, + "reward_std": 0.29662764072418213, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.7916666865348816, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 3413.9583740234375, + "epoch": 0.09714285714285714, + "grad_norm": 0.6789911389350891, + "kl": 0.02880859375, + "learning_rate": 8.511087728614862e-07, + "loss": -1.5654, + "reward": 0.1250000111758709, + "reward_std": 0.2611714005470276, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.1666666679084301, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 2726.70849609375, + "epoch": 0.09771428571428571, + "grad_norm": 0.3861570954322815, + "kl": 0.01446533203125, + "learning_rate": 8.487667956935087e-07, + "loss": 0.0023, + "reward": 0.4000000059604645, + "reward_std": 0.0774596706032753, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.5, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 2290.6251220703125, + "epoch": 0.09828571428571428, + "grad_norm": 0.9113690257072449, + "kl": 0.0257568359375, + "learning_rate": 8.464102570534061e-07, + "loss": -0.7408, + "reward": 0.2750000059604645, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.4583333432674408, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 1820.666748046875, + "epoch": 0.09885714285714285, + "grad_norm": 0.9667136073112488, + "kl": 0.02313232421875, + "learning_rate": 8.440392717955475e-07, + "loss": -1.6142, + "reward": 0.4750000238418579, + "reward_std": 0.26889464259147644, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.7083333432674408, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 3118.0, + "epoch": 0.09942857142857142, + "grad_norm": 0.8733185529708862, + "kl": 0.0316162109375, + "learning_rate": 8.416539554784089e-07, 
+ "loss": -1.6321, + "reward": 0.1250000111758709, + "reward_std": 0.13869691640138626, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.2083333395421505, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 1706.4583435058594, + "epoch": 0.1, + "grad_norm": 0.45300906896591187, + "kl": 0.012939453125, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0021, + "reward": 0.8000000566244125, + "reward_std": 0.14339563250541687, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/format_reward": 0.7083333432674408, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 1727.5417175292969, + "epoch": 0.10057142857142858, + "grad_norm": 0.659142255783081, + "kl": 0.021392822265625, + "learning_rate": 8.368407953869103e-07, + "loss": 0.0034, + "reward": 0.6000000536441803, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.2916666679084301, + "rewards/format_reward": 0.7083333432674408, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 2159.25, + "epoch": 0.10114285714285715, + "grad_norm": 0.8017681837081909, + "kl": 0.013885498046875, + "learning_rate": 8.344131861991828e-07, + "loss": 0.0022, + "reward": 0.4750000238418579, + "reward_std": 0.13869690895080566, + "rewards/accuracy_reward": 0.2916666865348816, + "rewards/format_reward": 0.5, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 3375.9583740234375, + "epoch": 0.10171428571428572, + "grad_norm": 1.2313588857650757, + "kl": 0.015167236328125, + "learning_rate": 8.319717151140072e-07, + "loss": -2.2319, + "reward": 0.17500000447034836, + "reward_std": 0.3254331722855568, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.2083333358168602, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 3116.625, + "epoch": 0.10228571428571429, + "grad_norm": 0.6798207759857178, + "kl": 0.02423095703125, + "learning_rate": 8.295165011252396e-07, + "loss": -1.4557, + "reward": 
0.2750000059604645, + "reward_std": 0.26783522963523865, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.2916666865348816, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 2302.25, + "epoch": 0.10285714285714286, + "grad_norm": 0.5794097781181335, + "kl": 0.022064208984375, + "learning_rate": 8.270476638965461e-07, + "loss": -0.532, + "reward": 0.5000000298023224, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/format_reward": 0.5, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 2662.5833740234375, + "epoch": 0.10342857142857143, + "grad_norm": 0.6577425599098206, + "kl": 0.019775390625, + "learning_rate": 8.245653237555705e-07, + "loss": -1.5816, + "reward": 0.4500000402331352, + "reward_std": 0.19993416219949722, + "rewards/accuracy_reward": 0.2916666679084301, + "rewards/format_reward": 0.4583333544433117, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 2937.2501220703125, + "epoch": 0.104, + "grad_norm": 0.8629750609397888, + "kl": 0.02593994140625, + "learning_rate": 8.220696016880687e-07, + "loss": -1.5823, + "reward": 0.15000001341104507, + "reward_std": 0.1549193412065506, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.2500000074505806, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 2815.2916870117188, + "epoch": 0.10457142857142857, + "grad_norm": 0.6297455430030823, + "kl": 0.02032470703125, + "learning_rate": 8.195606193320136e-07, + "loss": -0.8378, + "reward": 0.27500003576278687, + "reward_std": 0.11291590332984924, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.4166666865348816, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 1781.041748046875, + "epoch": 0.10514285714285715, + "grad_norm": 0.5706174969673157, + "kl": 0.012176513671875, + "learning_rate": 8.170384989716657e-07, + "loss": -0.743, + "reward": 0.7000000476837158, + 
"reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/format_reward": 0.7083333432674408, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 1956.2500610351562, + "epoch": 0.10571428571428572, + "grad_norm": 0.8781810402870178, + "kl": 0.0162353515625, + "learning_rate": 8.145033635316128e-07, + "loss": -1.4382, + "reward": 0.45000001788139343, + "reward_std": 0.18973666429519653, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.6666666865348816, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 2789.2501220703125, + "epoch": 0.10628571428571429, + "grad_norm": 0.7274031639099121, + "kl": 0.01470947265625, + "learning_rate": 8.119553365707802e-07, + "loss": -1.4622, + "reward": 0.4500000476837158, + "reward_std": 0.36425093561410904, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.4166666716337204, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 3284.791748046875, + "epoch": 0.10685714285714286, + "grad_norm": 0.7924415469169617, + "kl": 0.03204345703125, + "learning_rate": 8.093945422764069e-07, + "loss": -1.1243, + "reward": 0.07500000298023224, + "reward_std": 0.13869690895080566, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.125, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 2522.9583740234375, + "epoch": 0.10742857142857143, + "grad_norm": 0.8641192317008972, + "kl": 0.02154541015625, + "learning_rate": 8.068211054579943e-07, + "loss": -1.8636, + "reward": 0.6000000536441803, + "reward_std": 0.3966957926750183, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/format_reward": 0.5416666865348816, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 3382.625, + "epoch": 0.108, + "grad_norm": 1.1794185638427734, + "kl": 0.0162353515625, + "learning_rate": 8.04235151541222e-07, + "loss": -2.2379, + "reward": 0.20000001043081284, + "reward_std": 
0.32240865379571915, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.2500000074505806, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 2990.625, + "epoch": 0.10857142857142857, + "grad_norm": 0.6209798455238342, + "kl": 0.0179443359375, + "learning_rate": 8.01636806561836e-07, + "loss": -0.7416, + "reward": 0.125, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.2083333432674408, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 2806.8333740234375, + "epoch": 0.10914285714285714, + "grad_norm": 1.071854591369629, + "kl": 0.03564453125, + "learning_rate": 7.990261971595048e-07, + "loss": -1.7873, + "reward": 0.25, + "reward_std": 0.21162375807762146, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.3333333432674408, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 2162.95849609375, + "epoch": 0.10971428571428571, + "grad_norm": 0.6578007340431213, + "kl": 0.02581787109375, + "learning_rate": 7.964034505716476e-07, + "loss": -0.6846, + "reward": 0.550000011920929, + "reward_std": 0.14339563250541687, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.5416666865348816, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 3155.791748046875, + "epoch": 0.11028571428571429, + "grad_norm": 0.5719695687294006, + "kl": 0.0224609375, + "learning_rate": 7.93768694627233e-07, + "loss": -1.403, + "reward": 0.10000000521540642, + "reward_std": 0.14339563250541687, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.1666666679084301, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 1952.041748046875, + "epoch": 0.11085714285714286, + "grad_norm": 1.1899938583374023, + "kl": 0.037109375, + "learning_rate": 7.911220577405484e-07, + "loss": -1.389, + "reward": 0.6500000357627869, + "reward_std": 0.37408730387687683, + "rewards/accuracy_reward": 0.375, + 
"rewards/format_reward": 0.7083333730697632, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 2549.4583740234375, + "epoch": 0.11142857142857143, + "grad_norm": 0.7450345754623413, + "kl": 0.0228271484375, + "learning_rate": 7.884636689049422e-07, + "loss": -1.3458, + "reward": 0.3999999985098839, + "reward_std": 0.2658701241016388, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.5000000111758709, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 2487.7916870117188, + "epoch": 0.112, + "grad_norm": 0.5231258869171143, + "kl": 0.01495361328125, + "learning_rate": 7.857936576865356e-07, + "loss": -0.9285, + "reward": 0.375, + "reward_std": 0.08215838670730591, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.375, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 2426.3334350585938, + "epoch": 0.11257142857142857, + "grad_norm": 0.8786201477050781, + "kl": 0.0322265625, + "learning_rate": 7.831121542179086e-07, + "loss": -1.3101, + "reward": 0.42500003799796104, + "reward_std": 0.13869691640138626, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.4583333544433117, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 1954.1250610351562, + "epoch": 0.11314285714285714, + "grad_norm": 0.8747029900550842, + "kl": 0.02593994140625, + "learning_rate": 7.804192891917571e-07, + "loss": -1.4407, + "reward": 0.6000000536441803, + "reward_std": 0.3504374995827675, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.6666666716337204, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 2655.416748046875, + "epoch": 0.11371428571428571, + "grad_norm": 0.8738974928855896, + "kl": 0.02545166015625, + "learning_rate": 7.777151938545235e-07, + "loss": -1.3506, + "reward": 0.20000000298023224, + "reward_std": 0.18673625588417053, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.291666679084301, 
+ "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 2649.4583740234375, + "epoch": 0.11428571428571428, + "grad_norm": 0.7848045229911804, + "kl": 0.026123046875, + "learning_rate": 7.75e-07, + "loss": -1.6035, + "reward": 0.32500001788139343, + "reward_std": 0.26995331048965454, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.4166666716337204, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 2324.0833740234375, + "epoch": 0.11485714285714285, + "grad_norm": 0.6355379819869995, + "kl": 0.017059326171875, + "learning_rate": 7.72273839962904e-07, + "loss": -0.7416, + "reward": 0.30000001192092896, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.4583333432674408, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 2468.916748046875, + "epoch": 0.11542857142857142, + "grad_norm": 0.7434844374656677, + "kl": 0.017822265625, + "learning_rate": 7.695368466124296e-07, + "loss": 0.0029, + "reward": 0.40000003576278687, + "reward_std": 0.0774596706032753, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.5, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 2735.2916870117188, + "epoch": 0.116, + "grad_norm": 0.4409310221672058, + "kl": 0.01507568359375, + "learning_rate": 7.667891533457718e-07, + "loss": -1.4004, + "reward": 0.30000000447034836, + "reward_std": 0.2658701241016388, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.4166666679084301, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 2205.7501220703125, + "epoch": 0.11657142857142858, + "grad_norm": 0.5194834470748901, + "kl": 0.012176513671875, + "learning_rate": 7.640308940816239e-07, + "loss": 0.002, + "reward": 0.30000001192092896, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.5, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 2431.0, 
+ "epoch": 0.11714285714285715, + "grad_norm": 0.5818977355957031, + "kl": 0.018218994140625, + "learning_rate": 7.612622032536507e-07, + "loss": -0.722, + "reward": 0.375, + "reward_std": 0.17702671885490417, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.4583333432674408, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 2947.8751220703125, + "epoch": 0.11771428571428572, + "grad_norm": 1.0808711051940918, + "kl": 0.02557373046875, + "learning_rate": 7.584832158039378e-07, + "loss": -2.5238, + "reward": 0.20000001788139343, + "reward_std": 0.2323790118098259, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.3333333432674408, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 2864.2501220703125, + "epoch": 0.11828571428571429, + "grad_norm": 0.9076454639434814, + "kl": 0.0179443359375, + "learning_rate": 7.556940671764124e-07, + "loss": -1.2846, + "reward": 0.27500002086162567, + "reward_std": 0.2611714079976082, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.3333333358168602, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 2238.541748046875, + "epoch": 0.11885714285714286, + "grad_norm": 0.5197967886924744, + "kl": 0.0123291015625, + "learning_rate": 7.528948933102438e-07, + "loss": -0.7262, + "reward": 0.6250000298023224, + "reward_std": 0.18371173739433289, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.5416666865348816, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 2626.291748046875, + "epoch": 0.11942857142857143, + "grad_norm": 0.5860099196434021, + "kl": 0.012420654296875, + "learning_rate": 7.500858306332172e-07, + "loss": -1.2149, + "reward": 0.3750000149011612, + "reward_std": 0.13869690895080566, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.375, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 2280.5000610351562, + "epoch": 0.12, + "grad_norm": 
1.0006970167160034, + "kl": 0.02508544921875, + "learning_rate": 7.472670160550848e-07, + "loss": -1.5695, + "reward": 0.4000000059604645, + "reward_std": 0.17232800275087357, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.6250000298023224, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 1863.8750610351562, + "epoch": 0.12057142857142857, + "grad_norm": 0.7675609588623047, + "kl": 0.013702392578125, + "learning_rate": 7.444385869608921e-07, + "loss": -0.7409, + "reward": 0.6500000357627869, + "reward_std": 0.2658701241016388, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/format_reward": 0.7083333432674408, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 3500.916748046875, + "epoch": 0.12114285714285715, + "grad_norm": 1.411064863204956, + "kl": 0.02862548828125, + "learning_rate": 7.416006812042827e-07, + "loss": -2.3867, + "reward": 0.22500000149011612, + "reward_std": 0.3480285108089447, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.2083333395421505, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 1511.2500610351562, + "epoch": 0.12171428571428572, + "grad_norm": 0.5711240768432617, + "kl": 0.01910400390625, + "learning_rate": 7.387534371007797e-07, + "loss": 0.0031, + "reward": 0.5250000059604645, + "reward_std": 0.13869690895080566, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.75, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 2068.9584350585938, + "epoch": 0.12228571428571429, + "grad_norm": 0.3741142153739929, + "kl": 0.0205078125, + "learning_rate": 7.358969934210438e-07, + "loss": -0.5851, + "reward": 0.32500000298023224, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.5416666865348816, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 2977.791748046875, + "epoch": 0.12285714285714286, + "grad_norm": 
1.0709601640701294, + "kl": 0.02203369140625, + "learning_rate": 7.330314893841101e-07, + "loss": -2.8587, + "reward": 0.30000002682209015, + "reward_std": 0.32240864634513855, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.375, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 885.9583435058594, + "epoch": 0.12342857142857143, + "grad_norm": 0.8305730223655701, + "kl": 0.015655517578125, + "learning_rate": 7.301570646506027e-07, + "loss": -0.8887, + "reward": 1.0250000655651093, + "reward_std": 0.11291590332984924, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/format_reward": 0.9166666865348816, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 3584.0, + "epoch": 0.124, + "grad_norm": 0.4252244532108307, + "kl": 0.0159912109375, + "learning_rate": 7.27273859315928e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 2363.666748046875, + "epoch": 0.12457142857142857, + "grad_norm": 0.8345184922218323, + "kl": 0.0247802734375, + "learning_rate": 7.243820139034464e-07, + "loss": -1.5168, + "reward": 0.40000003576278687, + "reward_std": 0.2323790118098259, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.5, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 838.2083740234375, + "epoch": 0.12514285714285714, + "grad_norm": 1.1001908779144287, + "kl": 0.01165771484375, + "learning_rate": 7.214816693576234e-07, + "loss": 0.0019, + "reward": 0.9750000536441803, + "reward_std": 0.13869690895080566, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 2691.5833740234375, + "epoch": 0.12571428571428572, + "grad_norm": 0.48478028178215027, + "kl": 0.017822265625, + "learning_rate": 7.185729670371604e-07, + "loss": -0.9277, + "reward": 0.5, + 
"reward_std": 0.0774596706032753, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.5833333432674408, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 1311.3333740234375, + "epoch": 0.12628571428571428, + "grad_norm": 0.8523876667022705, + "kl": 0.015106201171875, + "learning_rate": 7.156560487081051e-07, + "loss": 0.0024, + "reward": 0.7500000298023224, + "reward_std": 0.16431677341461182, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.75, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 1624.291748046875, + "epoch": 0.12685714285714286, + "grad_norm": 0.6330533623695374, + "kl": 0.02435302734375, + "learning_rate": 7.127310565369415e-07, + "loss": -0.4816, + "reward": 0.675000011920929, + "reward_std": 0.3061862289905548, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/format_reward": 0.7916666865348816, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 2770.0, + "epoch": 0.12742857142857142, + "grad_norm": 0.8254171013832092, + "kl": 0.0244140625, + "learning_rate": 7.097981330836616e-07, + "loss": -1.1907, + "reward": 0.2500000111758709, + "reward_std": 0.24494898319244385, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.3333333544433117, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 2910.3751220703125, + "epoch": 0.128, + "grad_norm": 0.570878267288208, + "kl": 0.0177001953125, + "learning_rate": 7.068574212948169e-07, + "loss": -0.8357, + "reward": 0.4500000476837158, + "reward_std": 0.2323790118098259, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.4166666716337204, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 2329.0001220703125, + "epoch": 0.12857142857142856, + "grad_norm": 0.5944868326187134, + "kl": 0.0166015625, + "learning_rate": 7.039090644965509e-07, + "loss": -1.557, + "reward": 0.4500000402331352, + "reward_std": 0.19993416219949722, + 
"rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.541666679084301, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 1627.3334350585938, + "epoch": 0.12914285714285714, + "grad_norm": 0.7897810935974121, + "kl": 0.019805908203125, + "learning_rate": 7.009532063876148e-07, + "loss": -0.7406, + "reward": 0.6000000238418579, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.2916666865348816, + "rewards/format_reward": 0.7083333432674408, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 1878.7083740234375, + "epoch": 0.12971428571428573, + "grad_norm": 0.5080549716949463, + "kl": 0.01611328125, + "learning_rate": 6.979899910323624e-07, + "loss": -0.7196, + "reward": 0.6500000357627869, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.7083333432674408, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 1201.9167175292969, + "epoch": 0.13028571428571428, + "grad_norm": 0.7728170156478882, + "kl": 0.0109405517578125, + "learning_rate": 6.950195628537299e-07, + "loss": -0.6648, + "reward": 0.7500000596046448, + "reward_std": 0.22085529565811157, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/format_reward": 0.75, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 1906.3750610351562, + "epoch": 0.13085714285714287, + "grad_norm": 0.6207183003425598, + "kl": 0.015899658203125, + "learning_rate": 6.920420666261961e-07, + "loss": -1.5599, + "reward": 0.5250000357627869, + "reward_std": 0.20463287830352783, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.625, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 2654.2084350585938, + "epoch": 0.13142857142857142, + "grad_norm": 0.7562258243560791, + "kl": 0.01849365234375, + "learning_rate": 6.890576474687263e-07, + "loss": -2.2566, + "reward": 0.5000000298023224, + "reward_std": 0.3759405389428139, + 
"rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.6250000298023224, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 2595.666748046875, + "epoch": 0.132, + "grad_norm": 0.6036232709884644, + "kl": 0.024169921875, + "learning_rate": 6.860664508377001e-07, + "loss": -1.7953, + "reward": 0.3250000402331352, + "reward_std": 0.2370777204632759, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.416666679084301, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 2626.5001220703125, + "epoch": 0.13257142857142856, + "grad_norm": 1.021782398223877, + "kl": 0.035400390625, + "learning_rate": 6.83068622519821e-07, + "loss": -1.0878, + "reward": 0.27500003203749657, + "reward_std": 0.20463287830352783, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.3333333544433117, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 3502.3333740234375, + "epoch": 0.13314285714285715, + "grad_norm": 0.8032714128494263, + "kl": 0.02203369140625, + "learning_rate": 6.800643086250121e-07, + "loss": -1.0137, + "reward": 0.05000000447034836, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0833333358168602, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 2883.375, + "epoch": 0.1337142857142857, + "grad_norm": 0.8170682787895203, + "kl": 0.02398681640625, + "learning_rate": 6.770536555792944e-07, + "loss": -1.9184, + "reward": 0.2500000149011612, + "reward_std": 0.22963719069957733, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.2916666716337204, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 1954.541748046875, + "epoch": 0.13428571428571429, + "grad_norm": 0.7082000970840454, + "kl": 0.01861572265625, + "learning_rate": 6.740368101176495e-07, + "loss": -1.5973, + "reward": 0.5500000417232513, + "reward_std": 0.1741531491279602, + "rewards/accuracy_reward": 0.3333333544433117, + 
"rewards/format_reward": 0.583333358168602, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 1402.4166870117188, + "epoch": 0.13485714285714287, + "grad_norm": 0.6959501504898071, + "kl": 0.018798828125, + "learning_rate": 6.710139192768694e-07, + "loss": -0.742, + "reward": 0.7250000536441803, + "reward_std": 0.18371173739433289, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/format_reward": 0.9583333432674408, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 1551.0834350585938, + "epoch": 0.13542857142857143, + "grad_norm": 1.2837129831314087, + "kl": 0.018768310546875, + "learning_rate": 6.679851303883891e-07, + "loss": -1.248, + "reward": 0.45000001788139343, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.75, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 2601.291748046875, + "epoch": 0.136, + "grad_norm": 0.6121569275856018, + "kl": 0.02252197265625, + "learning_rate": 6.649505910711058e-07, + "loss": -0.9744, + "reward": 0.32500000298023224, + "reward_std": 0.22555401921272278, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.375, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 2471.7500610351562, + "epoch": 0.13657142857142857, + "grad_norm": 0.7961202263832092, + "kl": 0.02349853515625, + "learning_rate": 6.619104492241847e-07, + "loss": -1.697, + "reward": 0.5, + "reward_std": 0.3340982347726822, + "rewards/accuracy_reward": 0.291666679084301, + "rewards/format_reward": 0.5416666716337204, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 2069.291748046875, + "epoch": 0.13714285714285715, + "grad_norm": 26.108394622802734, + "kl": 0.1033935546875, + "learning_rate": 6.588648530198504e-07, + "loss": -1.4725, + "reward": 0.40000002086162567, + "reward_std": 0.17232800275087357, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.6250000149011612, + 
"step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 2281.0833435058594, + "epoch": 0.1377142857142857, + "grad_norm": 0.651018500328064, + "kl": 0.02239990234375, + "learning_rate": 6.558139508961654e-07, + "loss": -0.6258, + "reward": 0.6000000759959221, + "reward_std": 0.19993416219949722, + "rewards/accuracy_reward": 0.4583333544433117, + "rewards/format_reward": 0.5416666679084301, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 2965.625, + "epoch": 0.1382857142857143, + "grad_norm": 0.9381749033927917, + "kl": 0.02398681640625, + "learning_rate": 6.527578915497951e-07, + "loss": -1.8083, + "reward": 0.15000001341104507, + "reward_std": 0.1549193412065506, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.2500000074505806, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 2690.166748046875, + "epoch": 0.13885714285714285, + "grad_norm": 0.735163152217865, + "kl": 0.02117919921875, + "learning_rate": 6.496968239287603e-07, + "loss": -1.3621, + "reward": 0.2250000163912773, + "reward_std": 0.13869691640138626, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.3750000223517418, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 2612.5416870117188, + "epoch": 0.13942857142857143, + "grad_norm": 0.8661508560180664, + "kl": 0.0235595703125, + "learning_rate": 6.466308972251785e-07, + "loss": -1.5249, + "reward": 0.30000001192092896, + "reward_std": 0.2323790192604065, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.4166666716337204, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 2336.8751220703125, + "epoch": 0.14, + "grad_norm": 0.9766954779624939, + "kl": 0.019775390625, + "learning_rate": 6.435602608679916e-07, + "loss": -1.3257, + "reward": 0.3500000238418579, + "reward_std": 0.2566385716199875, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.4166666716337204, + "step": 245 + }, + { + 
"clip_ratio": 0.0, + "completion_length": 2764.25, + "epoch": 0.14057142857142857, + "grad_norm": 0.5228642821311951, + "kl": 0.01849365234375, + "learning_rate": 6.404850645156841e-07, + "loss": -0.5755, + "reward": 0.2250000163912773, + "reward_std": 0.18371173739433289, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.2916666679084301, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 3086.0001220703125, + "epoch": 0.14114285714285715, + "grad_norm": 0.7841210961341858, + "kl": 0.023162841796875, + "learning_rate": 6.374054580489873e-07, + "loss": -2.1551, + "reward": 0.45000001788139343, + "reward_std": 0.39986832439899445, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.4166666716337204, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 1750.7083740234375, + "epoch": 0.1417142857142857, + "grad_norm": 0.7208341360092163, + "kl": 0.017120361328125, + "learning_rate": 6.343215915635761e-07, + "loss": -0.7416, + "reward": 0.7250000536441803, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.7083333432674408, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 2371.8751220703125, + "epoch": 0.1422857142857143, + "grad_norm": 0.6523035168647766, + "kl": 0.018218994140625, + "learning_rate": 6.31233615362752e-07, + "loss": -0.7417, + "reward": 0.2750000059604645, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.4583333432674408, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 2791.416748046875, + "epoch": 0.14285714285714285, + "grad_norm": 0.7076215147972107, + "kl": 0.01898193359375, + "learning_rate": 6.281416799501187e-07, + "loss": -0.4655, + "reward": 0.32500001415610313, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.2916666679084301, + "step": 250 + }, + { + "clip_ratio": 0.0, + 
"completion_length": 1733.0000610351562, + "epoch": 0.14342857142857143, + "grad_norm": 0.5396320819854736, + "kl": 0.0194091796875, + "learning_rate": 6.25045936022246e-07, + "loss": -0.7209, + "reward": 0.5500000417232513, + "reward_std": 0.17232800275087357, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/format_reward": 0.7083333432674408, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 2229.7501220703125, + "epoch": 0.144, + "grad_norm": 1.1370456218719482, + "kl": 0.029296875, + "learning_rate": 6.219465344613258e-07, + "loss": -2.3579, + "reward": 0.30000002682209015, + "reward_std": 0.24978766590356827, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.458333358168602, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 2835.25, + "epoch": 0.14457142857142857, + "grad_norm": 0.5109323859214783, + "kl": 0.0166015625, + "learning_rate": 6.188436263278172e-07, + "loss": 0.0027, + "reward": 0.17500001192092896, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.25, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 2688.6250610351562, + "epoch": 0.14514285714285713, + "grad_norm": 0.5289558172225952, + "kl": 0.01513671875, + "learning_rate": 6.157373628530852e-07, + "loss": -1.8877, + "reward": 0.40000002086162567, + "reward_std": 0.36425092816352844, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.5, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 2283.25, + "epoch": 0.1457142857142857, + "grad_norm": 0.7051854729652405, + "kl": 0.01953125, + "learning_rate": 6.126278954320294e-07, + "loss": -1.7114, + "reward": 0.5, + "reward_std": 0.21162375062704086, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/format_reward": 0.5000000149011612, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 3230.7501220703125, + "epoch": 
0.1462857142857143, + "grad_norm": 0.6201116442680359, + "kl": 0.0255126953125, + "learning_rate": 6.095153756157051e-07, + "loss": -1.4431, + "reward": 0.10000000894069672, + "reward_std": 0.1549193412065506, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.1666666716337204, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 2076.9166870117188, + "epoch": 0.14685714285714285, + "grad_norm": 0.9742373824119568, + "kl": 0.02349853515625, + "learning_rate": 6.06399955103937e-07, + "loss": -1.6298, + "reward": 0.5000000298023224, + "reward_std": 0.27253394573926926, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/format_reward": 0.5833333432674408, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 1520.791748046875, + "epoch": 0.14742857142857144, + "grad_norm": 0.6934832334518433, + "kl": 0.011260986328125, + "learning_rate": 6.032817857379256e-07, + "loss": -0.7432, + "reward": 0.5250000357627869, + "reward_std": 0.13869691640138626, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.7083333432674408, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 2571.0, + "epoch": 0.148, + "grad_norm": 0.5018502473831177, + "kl": 0.01641845703125, + "learning_rate": 6.001610194928464e-07, + "loss": -0.8253, + "reward": 0.27500003576278687, + "reward_std": 0.14747881889343262, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.375, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 3117.416748046875, + "epoch": 0.14857142857142858, + "grad_norm": 134.84226989746094, + "kl": 1.43408203125, + "learning_rate": 5.97037808470444e-07, + "loss": -1.1063, + "reward": 0.22500000894069672, + "reward_std": 0.18371173739433289, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.25, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 3584.0, + "epoch": 0.14914285714285713, + "grad_norm": 0.3973945677280426, + "kl": 
0.0257568359375, + "learning_rate": 5.939123048916173e-07, + "loss": -0.5713, + "reward": 0.02500000223517418, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0416666679084301, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 2326.916748046875, + "epoch": 0.14971428571428572, + "grad_norm": 0.5301626324653625, + "kl": 0.019775390625, + "learning_rate": 5.907846610890011e-07, + "loss": -0.7393, + "reward": 0.4500000476837158, + "reward_std": 0.1549193412065506, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/format_reward": 0.5416666865348816, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 2577.666748046875, + "epoch": 0.15028571428571427, + "grad_norm": 0.6672086715698242, + "kl": 0.02447509765625, + "learning_rate": 5.87655029499542e-07, + "loss": -0.7189, + "reward": 0.32500000298023224, + "reward_std": 0.1596180573105812, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.375, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 3584.0, + "epoch": 0.15085714285714286, + "grad_norm": 0.40429234504699707, + "kl": 0.02337646484375, + "learning_rate": 5.845235626570683e-07, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 3360.25, + "epoch": 0.15142857142857144, + "grad_norm": 0.6446682214736938, + "kl": 0.025390625, + "learning_rate": 5.813904131848564e-07, + "loss": -0.616, + "reward": 0.05000000447034836, + "reward_std": 0.0774596706032753, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0833333358168602, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 2176.416748046875, + "epoch": 0.152, + "grad_norm": 0.7718228101730347, + "kl": 0.031494140625, + "learning_rate": 5.78255733788191e-07, + "loss": -0.4889, + "reward": 0.3750000149011612, + "reward_std": 
0.18371173739433289, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.5416666865348816, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 1419.5416870117188, + "epoch": 0.15257142857142858, + "grad_norm": 0.5885297656059265, + "kl": 0.018798828125, + "learning_rate": 5.751196772469237e-07, + "loss": 0.003, + "reward": 0.5500000417232513, + "reward_std": 0.1549193412065506, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.75, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 2091.0000610351562, + "epoch": 0.15314285714285714, + "grad_norm": 0.7558290958404541, + "kl": 0.01812744140625, + "learning_rate": 5.71982396408026e-07, + "loss": -0.5488, + "reward": 0.45000001788139343, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.5416666865348816, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 3082.3333740234375, + "epoch": 0.15371428571428572, + "grad_norm": 0.9141106009483337, + "kl": 0.021484375, + "learning_rate": 5.688440441781398e-07, + "loss": -2.2151, + "reward": 0.15000001341104507, + "reward_std": 0.2323790118098259, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.2500000074505806, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.1666870117188, + "epoch": 0.15428571428571428, + "grad_norm": 0.8448731899261475, + "kl": 0.01568603515625, + "learning_rate": 5.657047735161255e-07, + "loss": -0.7411, + "reward": 0.675000011920929, + "reward_std": 0.13869690895080566, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/format_reward": 0.7083333432674408, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 1341.916748046875, + "epoch": 0.15485714285714286, + "grad_norm": 1.3789052963256836, + "kl": 0.020538330078125, + "learning_rate": 5.625647374256061e-07, + "loss": -0.7409, + "reward": 0.6500000357627869, + "reward_std": 
0.19993415474891663, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.9583333432674408, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 2583.7083740234375, + "epoch": 0.15542857142857142, + "grad_norm": 1.2276079654693604, + "kl": 0.0244140625, + "learning_rate": 5.594240889475106e-07, + "loss": -2.3455, + "reward": 0.40000002086162567, + "reward_std": 0.36425092816352844, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.5, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 2224.0, + "epoch": 0.156, + "grad_norm": 0.45050886273384094, + "kl": 0.022003173828125, + "learning_rate": 5.562829811526154e-07, + "loss": 0.0035, + "reward": 0.45000001788139343, + "reward_std": 0.1549193412065506, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/format_reward": 0.5, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 2945.0833740234375, + "epoch": 0.15657142857142858, + "grad_norm": 0.7557133436203003, + "kl": 0.01849365234375, + "learning_rate": 5.531415671340826e-07, + "loss": 0.003, + "reward": 0.2250000238418579, + "reward_std": 0.08215838670730591, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.25, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 1650.8333740234375, + "epoch": 0.15714285714285714, + "grad_norm": 0.675412654876709, + "kl": 0.014434814453125, + "learning_rate": 5.5e-07, + "loss": -0.7416, + "reward": 0.5000000149011612, + "reward_std": 0.19993415474891663, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.7083333432674408, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 1545.3334350585938, + "epoch": 0.15771428571428572, + "grad_norm": 0.7588775157928467, + "kl": 0.018646240234375, + "learning_rate": 5.468584328659172e-07, + "loss": -1.4864, + "reward": 0.9750000238418579, + "reward_std": 0.18371173739433289, + "rewards/accuracy_reward": 0.7083333432674408, 
+ "rewards/format_reward": 0.9166666865348816, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 2915.791748046875, + "epoch": 0.15828571428571428, + "grad_norm": 1.0028012990951538, + "kl": 0.0211181640625, + "learning_rate": 5.437170188473847e-07, + "loss": -1.5921, + "reward": 0.3499999940395355, + "reward_std": 0.26039472222328186, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.375, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 2256.25, + "epoch": 0.15885714285714286, + "grad_norm": 0.644159734249115, + "kl": 0.015411376953125, + "learning_rate": 5.405759110524894e-07, + "loss": 0.0025, + "reward": 0.40000003576278687, + "reward_std": 0.14339563250541687, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.5, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 2127.9584350585938, + "epoch": 0.15942857142857142, + "grad_norm": 1.0713709592819214, + "kl": 0.0250244140625, + "learning_rate": 5.37435262574394e-07, + "loss": -2.5405, + "reward": 0.5500000268220901, + "reward_std": 0.2861757129430771, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.75, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 2087.2083740234375, + "epoch": 0.16, + "grad_norm": 0.44367942214012146, + "kl": 0.014678955078125, + "learning_rate": 5.342952264838747e-07, + "loss": -0.7211, + "reward": 0.5000000298023224, + "reward_std": 0.14339563250541687, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.625, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 1533.0417175292969, + "epoch": 0.16057142857142856, + "grad_norm": 1.1346826553344727, + "kl": 0.015472412109375, + "learning_rate": 5.311559558218603e-07, + "loss": -0.7422, + "reward": 0.5500000417232513, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.6666666865348816, + "step": 281 + }, 
+ { + "clip_ratio": 0.0, + "completion_length": 2707.9166870117188, + "epoch": 0.16114285714285714, + "grad_norm": 0.7678596377372742, + "kl": 0.0263671875, + "learning_rate": 5.28017603591974e-07, + "loss": -0.7124, + "reward": 0.30000001192092896, + "reward_std": 0.20871606469154358, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.375, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 2104.8333740234375, + "epoch": 0.16171428571428573, + "grad_norm": 0.3849073052406311, + "kl": 0.01458740234375, + "learning_rate": 5.248803227530763e-07, + "loss": 0.0023, + "reward": 0.32500000298023224, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.5, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 2547.9166870117188, + "epoch": 0.16228571428571428, + "grad_norm": 1.1764817237854004, + "kl": 0.01708984375, + "learning_rate": 5.21744266211809e-07, + "loss": -2.3522, + "reward": 0.5750000178813934, + "reward_std": 0.3804733455181122, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/format_reward": 0.5000000298023224, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 2855.8333740234375, + "epoch": 0.16285714285714287, + "grad_norm": 0.8596624135971069, + "kl": 0.0228271484375, + "learning_rate": 5.186095868151436e-07, + "loss": -1.918, + "reward": 0.2500000111758709, + "reward_std": 0.2773938328027725, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.3333333544433117, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 2617.541748046875, + "epoch": 0.16342857142857142, + "grad_norm": 1.1564656496047974, + "kl": 0.0211181640625, + "learning_rate": 5.154764373429315e-07, + "loss": -2.6721, + "reward": 0.6750000417232513, + "reward_std": 0.37711599469184875, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.625, + "step": 286 + }, + { + "clip_ratio": 0.0, + 
"completion_length": 1809.9584350585938, + "epoch": 0.164, + "grad_norm": 0.8912317156791687, + "kl": 0.017974853515625, + "learning_rate": 5.123449705004581e-07, + "loss": -1.9258, + "reward": 0.5750000476837158, + "reward_std": 0.3480285108089447, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/format_reward": 0.7083333432674408, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 2473.6666870117188, + "epoch": 0.16457142857142856, + "grad_norm": 0.6116588115692139, + "kl": 0.01898193359375, + "learning_rate": 5.09215338910999e-07, + "loss": -0.8855, + "reward": 0.375, + "reward_std": 0.13869690895080566, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.4166666865348816, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 1470.0416870117188, + "epoch": 0.16514285714285715, + "grad_norm": 0.5574433207511902, + "kl": 0.017059326171875, + "learning_rate": 5.060876951083828e-07, + "loss": -0.7664, + "reward": 0.6250000298023224, + "reward_std": 0.21615658700466156, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.9166666865348816, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 1707.041748046875, + "epoch": 0.1657142857142857, + "grad_norm": 1.1696451902389526, + "kl": 0.0301513671875, + "learning_rate": 5.02962191529556e-07, + "loss": -1.657, + "reward": 0.5250000208616257, + "reward_std": 0.23356524109840393, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.7916666865348816, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 2784.0001220703125, + "epoch": 0.1662857142857143, + "grad_norm": 1.8924946784973145, + "kl": 0.0260009765625, + "learning_rate": 4.998389805071536e-07, + "loss": -2.9789, + "reward": 0.32500001788139343, + "reward_std": 0.33025872707366943, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.4583333432674408, + "step": 291 + }, + { + "clip_ratio": 0.0, + 
"completion_length": 1551.4166870117188, + "epoch": 0.16685714285714287, + "grad_norm": 1.2713048458099365, + "kl": 0.01983642578125, + "learning_rate": 4.967182142620745e-07, + "loss": -2.4029, + "reward": 0.5250000208616257, + "reward_std": 0.27286098897457123, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.7916666865348816, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 1370.5417175292969, + "epoch": 0.16742857142857143, + "grad_norm": 0.436497300863266, + "kl": 0.013275146484375, + "learning_rate": 4.93600044896063e-07, + "loss": 0.0021, + "reward": 0.6750000417232513, + "reward_std": 0.13869691640138626, + "rewards/accuracy_reward": 0.3750000223517418, + "rewards/format_reward": 0.75, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 2411.7500610351562, + "epoch": 0.168, + "grad_norm": 0.9047071933746338, + "kl": 0.021728515625, + "learning_rate": 4.904846243842949e-07, + "loss": -1.6188, + "reward": 0.30000003427267075, + "reward_std": 0.1549193412065506, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.5000000223517418, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 2314.9583740234375, + "epoch": 0.16857142857142857, + "grad_norm": 0.8080445528030396, + "kl": 0.02862548828125, + "learning_rate": 4.873721045679706e-07, + "loss": -2.1152, + "reward": 0.30000003427267075, + "reward_std": 0.19993416219949722, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.5000000223517418, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 2848.9583740234375, + "epoch": 0.16914285714285715, + "grad_norm": 0.5764908790588379, + "kl": 0.02508544921875, + "learning_rate": 4.842626371469149e-07, + "loss": -0.6015, + "reward": 0.17500000819563866, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.25, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 2570.166748046875, + 
"epoch": 0.1697142857142857, + "grad_norm": 1.0034595727920532, + "kl": 0.02734375, + "learning_rate": 4.811563736721829e-07, + "loss": -2.3512, + "reward": 0.45000000298023224, + "reward_std": 0.2658701241016388, + "rewards/accuracy_reward": 0.3333333544433117, + "rewards/format_reward": 0.4166666865348816, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 1806.4584350585938, + "epoch": 0.1702857142857143, + "grad_norm": 0.8656820058822632, + "kl": 0.0198974609375, + "learning_rate": 4.780534655386743e-07, + "loss": -1.6228, + "reward": 0.5250000506639481, + "reward_std": 0.29361625015735626, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.6666666716337204, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 1468.416748046875, + "epoch": 0.17085714285714285, + "grad_norm": 1.4623134136199951, + "kl": 0.025634765625, + "learning_rate": 4.749540639777539e-07, + "loss": -2.5606, + "reward": 0.4750000089406967, + "reward_std": 0.31102490425109863, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.7083333432674408, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 2191.3333740234375, + "epoch": 0.17142857142857143, + "grad_norm": 0.7832638621330261, + "kl": 0.02301025390625, + "learning_rate": 4.7185832004988133e-07, + "loss": -1.4816, + "reward": 0.32500001788139343, + "reward_std": 0.13869691640138626, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.5416666865348816, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 2002.916748046875, + "epoch": 0.172, + "grad_norm": 0.4890463948249817, + "kl": 0.013916015625, + "learning_rate": 1.5267358321348285e-07, + "loss": -0.6248, + "reward": 0.3500000238418579, + "reward_std": 0.0774596706032753, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.5833333432674408, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 1066.3333435058594, + "epoch": 
0.17257142857142857, + "grad_norm": 1.2706708908081055, + "kl": 0.01739501953125, + "learning_rate": 1.5058639494795067e-07, + "loss": -1.6018, + "reward": 0.7750000357627869, + "reward_std": 0.20295868068933487, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/format_reward": 0.8750000298023224, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 1463.7083435058594, + "epoch": 0.17314285714285715, + "grad_norm": 0.5108232498168945, + "kl": 0.01483154296875, + "learning_rate": 1.485389347912525e-07, + "loss": -0.8718, + "reward": 0.6500000357627869, + "reward_std": 0.22085529565811157, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.9166666865348816, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 2589.916748046875, + "epoch": 0.1737142857142857, + "grad_norm": 1.3276212215423584, + "kl": 0.013671875, + "learning_rate": 1.4653140639624066e-07, + "loss": -3.066, + "reward": 0.45000001788139343, + "reward_std": 0.43685072660446167, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.5833333432674408, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 2135.0, + "epoch": 0.1742857142857143, + "grad_norm": 1.30924654006958, + "kl": 0.012664794921875, + "learning_rate": 1.4456400944391144e-07, + "loss": -0.9444, + "reward": 0.3750000223517418, + "reward_std": 0.18371173739433289, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.5833333358168602, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 1213.0833740234375, + "epoch": 0.17485714285714285, + "grad_norm": 1.3626219034194946, + "kl": 0.02374267578125, + "learning_rate": 1.4263693962354336e-07, + "loss": -1.6655, + "reward": 0.7500000596046448, + "reward_std": 0.3175487816333771, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/format_reward": 0.8333333432674408, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 2899.25, + 
"epoch": 0.17542857142857143, + "grad_norm": 0.6594395041465759, + "kl": 0.01434326171875, + "learning_rate": 1.4075038861323302e-07, + "loss": -0.8706, + "reward": 0.27500003576278687, + "reward_std": 0.18371173739433289, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.3333333432674408, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 1401.2500915527344, + "epoch": 0.176, + "grad_norm": 0.5402439832687378, + "kl": 0.016876220703125, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.0028, + "reward": 0.9000000357627869, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.75, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 2200.416748046875, + "epoch": 0.17657142857142857, + "grad_norm": 0.667547881603241, + "kl": 0.024871826171875, + "learning_rate": 1.3709958956526974e-07, + "loss": -1.5809, + "reward": 0.32500000298023224, + "reward_std": 0.1596180498600006, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.5416666865348816, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 2872.5833740234375, + "epoch": 0.17714285714285713, + "grad_norm": 0.6462530493736267, + "kl": 0.014068603515625, + "learning_rate": 1.353357046583165e-07, + "loss": -1.4915, + "reward": 0.30000001192092896, + "reward_std": 0.2258318066596985, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.3750000149011612, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 2144.3333740234375, + "epoch": 0.1777142857142857, + "grad_norm": 0.760235607624054, + "kl": 0.012969970703125, + "learning_rate": 1.3361306478670148e-07, + "loss": -2.1933, + "reward": 0.3500000089406967, + "reward_std": 0.23826396465301514, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.5416666865348816, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 1499.1250610351562, + "epoch": 0.1782857142857143, + "grad_norm": 
1.0971490144729614, + "kl": 0.02130126953125, + "learning_rate": 1.3193184129467384e-07, + "loss": -0.6513, + "reward": 0.4750000238418579, + "reward_std": 0.06123724579811096, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7916666865348816, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 1944.8751220703125, + "epoch": 0.17885714285714285, + "grad_norm": 0.6792640089988708, + "kl": 0.01654052734375, + "learning_rate": 1.3029220140695756e-07, + "loss": -0.4889, + "reward": 0.3750000298023224, + "reward_std": 0.18371173739433289, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.5416666865348816, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 1524.5000610351562, + "epoch": 0.17942857142857144, + "grad_norm": 1.003913402557373, + "kl": 0.015625, + "learning_rate": 1.2869430821211826e-07, + "loss": -1.2384, + "reward": 0.7500000298023224, + "reward_std": 0.2173428237438202, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/format_reward": 0.7083333432674408, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 2062.4166870117188, + "epoch": 0.18, + "grad_norm": 0.8467549085617065, + "kl": 0.01544189453125, + "learning_rate": 1.2713832064634125e-07, + "loss": -1.1761, + "reward": 0.40000002086162567, + "reward_std": 0.14339563250541687, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.6666666716337204, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 1394.25, + "epoch": 0.18057142857142858, + "grad_norm": 1.1105024814605713, + "kl": 0.021453857421875, + "learning_rate": 1.2562439347762275e-07, + "loss": -1.5211, + "reward": 0.5500000417232513, + "reward_std": 0.19993415474891663, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.875, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 1474.3333740234375, + "epoch": 0.18114285714285713, + "grad_norm": 0.979469358921051, + "kl": 
0.011138916015625, + "learning_rate": 1.2415267729037608e-07, + "loss": -2.2617, + "reward": 0.6500000059604645, + "reward_std": 0.3548534959554672, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/format_reward": 0.7916666865348816, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 1628.416748046875, + "epoch": 0.18171428571428572, + "grad_norm": 0.563599705696106, + "kl": 0.01849365234375, + "learning_rate": 1.2272331847045313e-07, + "loss": 0.003, + "reward": 0.7500000894069672, + "reward_std": 0.19993416219949722, + "rewards/accuracy_reward": 0.5000000223517418, + "rewards/format_reward": 0.75, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 1559.5833435058594, + "epoch": 0.18228571428571427, + "grad_norm": 0.9234899282455444, + "kl": 0.01715087890625, + "learning_rate": 1.2133645919058418e-07, + "loss": -1.6808, + "reward": 0.6250000149011612, + "reward_std": 0.19037556648254395, + "rewards/accuracy_reward": 0.2916666679084301, + "rewards/format_reward": 0.75, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 1388.8333740234375, + "epoch": 0.18285714285714286, + "grad_norm": 1.0862469673156738, + "kl": 0.020263671875, + "learning_rate": 1.1999223739623666e-07, + "loss": -1.6945, + "reward": 0.6000000089406967, + "reward_std": 0.2658701241016388, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.8333333432674408, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 2178.0001220703125, + "epoch": 0.18342857142857144, + "grad_norm": 1.5233582258224487, + "kl": 0.022247314453125, + "learning_rate": 1.1869078679189393e-07, + "loss": -1.4616, + "reward": 0.32500001788139343, + "reward_std": 0.21615657955408096, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.458333358168602, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 2045.166748046875, + "epoch": 0.184, + "grad_norm": 1.028342843055725, + "kl": 
0.01947021484375, + "learning_rate": 1.1743223682775649e-07, + "loss": -1.4788, + "reward": 0.40000002086162567, + "reward_std": 0.23826396465301514, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.5833333432674408, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 1766.9583740234375, + "epoch": 0.18457142857142858, + "grad_norm": 1.2755353450775146, + "kl": 0.0216064453125, + "learning_rate": 1.1621671268686605e-07, + "loss": -2.4408, + "reward": 0.5000000298023224, + "reward_std": 0.3290724903345108, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.7083333432674408, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 1399.8750610351562, + "epoch": 0.18514285714285714, + "grad_norm": 0.7948786020278931, + "kl": 0.01861572265625, + "learning_rate": 1.1504433527265378e-07, + "loss": -1.615, + "reward": 0.6000000089406967, + "reward_std": 0.31572362780570984, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.8333333432674408, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 1793.8334350585938, + "epoch": 0.18571428571428572, + "grad_norm": 1.1149201393127441, + "kl": 0.02008056640625, + "learning_rate": 1.1391522119691496e-07, + "loss": -2.113, + "reward": 0.4750000238418579, + "reward_std": 0.3386310636997223, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.6666666865348816, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 1259.4583740234375, + "epoch": 0.18628571428571428, + "grad_norm": 0.9218916296958923, + "kl": 0.024658203125, + "learning_rate": 1.1282948276820962e-07, + "loss": -0.7426, + "reward": 0.6000000536441803, + "reward_std": 0.22085530310869217, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.8333333432674408, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 1897.2083740234375, + "epoch": 0.18685714285714286, + 
"grad_norm": 1.3489266633987427, + "kl": 0.02215576171875, + "learning_rate": 1.1178722798069215e-07, + "loss": -2.2226, + "reward": 0.5000000149011612, + "reward_std": 0.2773938328027725, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.6666666865348816, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 1416.5833435058594, + "epoch": 0.18742857142857142, + "grad_norm": 0.8123278617858887, + "kl": 0.02020263671875, + "learning_rate": 1.10788560503369e-07, + "loss": 0.0032, + "reward": 0.5500000268220901, + "reward_std": 0.14339563250541687, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.75, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 1962.4583740234375, + "epoch": 0.188, + "grad_norm": 0.5757811665534973, + "kl": 0.019775390625, + "learning_rate": 1.0983357966978745e-07, + "loss": -0.9902, + "reward": 0.42499999701976776, + "reward_std": 0.20463287830352783, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.625, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 1766.2500610351562, + "epoch": 0.18857142857142858, + "grad_norm": 0.9221534132957458, + "kl": 0.01800537109375, + "learning_rate": 1.0892238046815527e-07, + "loss": -1.6717, + "reward": 0.5500000268220901, + "reward_std": 0.2983149588108063, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/format_reward": 0.7083333432674408, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 1579.5833435058594, + "epoch": 0.18914285714285714, + "grad_norm": 0.9036383628845215, + "kl": 0.01678466796875, + "learning_rate": 1.0805505353189254e-07, + "loss": -2.0184, + "reward": 0.5750000476837158, + "reward_std": 0.18371173739433289, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.7083333432674408, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 1057.0, + "epoch": 0.18971428571428572, + "grad_norm": 0.5648021697998047, + 
"kl": 0.010589599609375, + "learning_rate": 1.0723168513061665e-07, + "loss": -0.7254, + "reward": 0.7000000476837158, + "reward_std": 0.17232800275087357, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/format_reward": 0.9583333432674408, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 1274.291748046875, + "epoch": 0.19028571428571428, + "grad_norm": 0.8038429617881775, + "kl": 0.01422119140625, + "learning_rate": 1.0645235716156168e-07, + "loss": -1.4998, + "reward": 0.5500000417232513, + "reward_std": 0.17232800275087357, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.8750000298023224, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completion_length": 1840.2084350585938, + "epoch": 0.19085714285714286, + "grad_norm": 0.7369369864463806, + "kl": 0.01922607421875, + "learning_rate": 1.0571714714143197e-07, + "loss": -0.7416, + "reward": 0.5750000476837158, + "reward_std": 0.18371173739433289, + "rewards/accuracy_reward": 0.2916666865348816, + "rewards/format_reward": 0.6666666865348816, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 1033.6666870117188, + "epoch": 0.19142857142857142, + "grad_norm": 0.9152414202690125, + "kl": 0.012664794921875, + "learning_rate": 1.0502612819869216e-07, + "loss": -0.759, + "reward": 0.5750000476837158, + "reward_std": 0.11291590332984924, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.9166666865348816, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 929.3333435058594, + "epoch": 0.192, + "grad_norm": 0.354445219039917, + "kl": 0.016204833984375, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0026, + "reward": 0.7750000357627869, + "reward_std": 0.1596180573105812, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/format_reward": 1.0, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 1763.0000915527344, + "epoch": 0.19257142857142856, + "grad_norm": 
1.0156564712524414, + "kl": 0.015411376953125, + "learning_rate": 1.0377693407483638e-07, + "loss": -1.6813, + "reward": 0.6500000357627869, + "reward_std": 0.2658701241016388, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.8333333432674408, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 1598.7083435058594, + "epoch": 0.19314285714285714, + "grad_norm": 0.8538982272148132, + "kl": 0.021728515625, + "learning_rate": 1.032188831461732e-07, + "loss": -1.2843, + "reward": 0.7000000178813934, + "reward_std": 0.2658701241016388, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/format_reward": 0.7500000298023224, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 1587.2083740234375, + "epoch": 0.19371428571428573, + "grad_norm": 1.12893545627594, + "kl": 0.01947021484375, + "learning_rate": 1.0270527178744664e-07, + "loss": -1.5945, + "reward": 0.5000000447034836, + "reward_std": 0.14339563250541687, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8333333432674408, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 1353.5833740234375, + "epoch": 0.19428571428571428, + "grad_norm": 0.6671582460403442, + "kl": 0.01910400390625, + "learning_rate": 1.0223615108556937e-07, + "loss": -0.7477, + "reward": 0.6000000536441803, + "reward_std": 0.20871604979038239, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.875, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 1181.7500610351562, + "epoch": 0.19485714285714287, + "grad_norm": 0.725435733795166, + "kl": 0.01190185546875, + "learning_rate": 1.0181156770214242e-07, + "loss": -1.4692, + "reward": 0.5750000476837158, + "reward_std": 0.15610557794570923, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.9166666865348816, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 870.9166870117188, + "epoch": 0.19542857142857142, + "grad_norm": 
1.4129509925842285, + "kl": 0.017486572265625, + "learning_rate": 1.0143156386881408e-07, + "loss": -0.742, + "reward": 0.7250000834465027, + "reward_std": 0.21615658700466156, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9583333432674408, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 1194.5000610351562, + "epoch": 0.196, + "grad_norm": 1.0423340797424316, + "kl": 0.0213623046875, + "learning_rate": 1.0109617738307911e-07, + "loss": -1.4624, + "reward": 0.6500000357627869, + "reward_std": 0.24494898319244385, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.9166666865348816, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 1545.8750610351562, + "epoch": 0.19657142857142856, + "grad_norm": 0.3742052912712097, + "kl": 0.0191650390625, + "learning_rate": 1.0080544160451918e-07, + "loss": 0.0031, + "reward": 0.5500000417232513, + "reward_std": 0.14339563250541687, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.75, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 1445.9583435058594, + "epoch": 0.19714285714285715, + "grad_norm": 1.0773917436599731, + "kl": 0.019287109375, + "learning_rate": 1.0055938545148495e-07, + "loss": -0.5323, + "reward": 0.7500000596046448, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/format_reward": 0.75, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 1790.5000610351562, + "epoch": 0.1977142857142857, + "grad_norm": 0.7913344502449036, + "kl": 0.0220947265625, + "learning_rate": 1.0035803339821934e-07, + "loss": -1.419, + "reward": 0.550000011920929, + "reward_std": 0.3209003359079361, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.7500000298023224, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 1382.2916870117188, + "epoch": 0.1982857142857143, + "grad_norm": 1.1861852407455444, + "kl": 
0.015045166015625, + "learning_rate": 1.002014054724235e-07, + "loss": -1.5114, + "reward": 0.5750000178813934, + "reward_std": 0.21615657955408096, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.8750000298023224, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 1589.2500610351562, + "epoch": 0.19885714285714284, + "grad_norm": 0.8138798475265503, + "kl": 0.025634765625, + "learning_rate": 1.0008951725326441e-07, + "loss": -1.7143, + "reward": 0.4750000238418579, + "reward_std": 0.1596180573105812, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7916666865348816, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 1109.5417175292969, + "epoch": 0.19942857142857143, + "grad_norm": 0.814863383769989, + "kl": 0.024444580078125, + "learning_rate": 1.0002237986982564e-07, + "loss": -0.7403, + "reward": 0.7750000357627869, + "reward_std": 0.13869691640138626, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.9583333432674408, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completion_length": 1071.9166870117188, + "epoch": 0.2, + "grad_norm": 0.7820748090744019, + "kl": 0.01959228515625, + "learning_rate": 1e-07, + "loss": -1.4839, + "reward": 0.7000000476837158, + "reward_std": 0.12247449159622192, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9166666865348816, + "step": 350 + }, + { + "epoch": 0.2, + "step": 350, + "total_flos": 0.0, + "train_loss": -0.17849066271579692, + "train_runtime": 2375.2815, + "train_samples_per_second": 3.536, + "train_steps_per_second": 0.147 + } + ], + "logging_steps": 1, + "max_steps": 350, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + 
"train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..c87c458 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfa348d55c327dbf94e4cbc8285bb7697af779bc21981a4889e99a728d7247bc +size 8312