commit ddfd33fab170f24776c89531ef10baf393e58ac2 Author: ModelHub XC Date: Mon Jun 1 00:21:17 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: mimoidochi/OpenRS-GRPO-S-2 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 
0000000..850630a --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +datasets: knoveleng/open-rs +library_name: transformers +model_name: OpenRS-GRPO-S-2 +tags: +- generated_from_trainer +- open-r1 +- trl +- grpo +licence: license +--- + +# Model Card for OpenRS-GRPO-S-2 + +This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on the [knoveleng/open-rs](https://huggingface.co/datasets/knoveleng/open-rs) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="mimoidochi/OpenRS-GRPO-S-2", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/vrshy-stanford/huggingface/runs/7vfgighq) + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + +### Framework versions + +- TRL: 0.16.0.dev0 +- Transformers: 4.49.0 +- Pytorch: 2.5.1 +- Datasets: 4.5.0 +- Tokenizers: 0.21.4 + +## Citations + +Cite GRPO as: + +```bibtex +@article{zhihong2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. 
Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} + +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..24a5256 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": -1.2733016510141806, + "train_runtime": 18317.2078, + "train_samples": 7000, + "train_samples_per_second": 0.459, + "train_steps_per_second": 0.019 +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..7052064 --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": 4096, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.49.0", + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..01dfe4b --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + 
"bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.49.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..b79cdd7 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93bc3ec603c3cefeb9d5f7492f456e4193f5833cadd3a9b72ef6be2e47dc90f4 +size 3554214752 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..e7cd2c1 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a +size 11422959 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ef6e98c --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,195 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + 
"content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + 
'```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..24a5256 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": -1.2733016510141806, + "train_runtime": 18317.2078, + "train_samples": 7000, + "train_samples_per_second": 0.459, + "train_steps_per_second": 0.019 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..ded0648 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,4592 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2, + "eval_steps": 500, + 
"global_step": 350, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 3134.95849609375, + "epoch": 0.0005714285714285715, + "grad_norm": 0.7448968291282654, + "kl": 0.0, + "learning_rate": 2.857142857142857e-08, + "loss": -0.9387, + "reward": 0.1666666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 2868.9583740234375, + "epoch": 0.001142857142857143, + "grad_norm": 0.39778050780296326, + "kl": 0.0, + "learning_rate": 5.714285714285714e-08, + "loss": -0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 2831.75, + "epoch": 0.0017142857142857142, + "grad_norm": 1.477035641670227, + "kl": 2.5451183319091797e-05, + "learning_rate": 8.571428571428572e-08, + "loss": -1.9087, + "reward": 0.2916666865348816, + "reward_std": 0.26603007316589355, + "rewards/accuracy_reward": 0.2916666865348816, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 2640.375, + "epoch": 0.002285714285714286, + "grad_norm": 1.3637254238128662, + "kl": 2.60770320892334e-05, + "learning_rate": 1.1428571428571427e-07, + "loss": -0.9903, + "reward": 0.125, + "reward_std": 0.1369306445121765, + "rewards/accuracy_reward": 0.125, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 2990.416748046875, + "epoch": 0.002857142857142857, + "grad_norm": 0.9073159694671631, + "kl": 4.756450653076172e-05, + "learning_rate": 1.4285714285714285e-07, + "loss": -0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 2824.9583740234375, + "epoch": 0.0034285714285714284, + "grad_norm": 0.2965989410877228, + "kl": 3.439188003540039e-05, + "learning_rate": 1.7142857142857143e-07, + "loss": -0.0, 
+ "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 3328.125, + "epoch": 0.004, + "grad_norm": 0.6783003807067871, + "kl": 3.141164779663086e-05, + "learning_rate": 2e-07, + "loss": -0.735, + "reward": 0.0416666679084301, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 2861.7501220703125, + "epoch": 0.004571428571428572, + "grad_norm": 0.6755802035331726, + "kl": 2.0116567611694336e-05, + "learning_rate": 2.2857142857142855e-07, + "loss": -0.733, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 2332.33349609375, + "epoch": 0.005142857142857143, + "grad_norm": 1.6076233386993408, + "kl": 4.553794860839844e-05, + "learning_rate": 2.571428571428571e-07, + "loss": -1.4705, + "reward": 0.2500000111758709, + "reward_std": 0.20412415266036987, + "rewards/accuracy_reward": 0.2500000111758709, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 2922.7083740234375, + "epoch": 0.005714285714285714, + "grad_norm": 1.9102457761764526, + "kl": 2.950429916381836e-05, + "learning_rate": 2.857142857142857e-07, + "loss": -2.88, + "reward": 0.2916666716337204, + "reward_std": 0.395129531621933, + "rewards/accuracy_reward": 0.2916666716337204, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 2510.08349609375, + "epoch": 0.006285714285714286, + "grad_norm": 2.036496639251709, + "kl": 5.0187110900878906e-05, + "learning_rate": 3.142857142857143e-07, + "loss": -1.6748, + "reward": 0.2083333395421505, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.2083333395421505, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 2951.5001220703125, + "epoch": 0.006857142857142857, + "grad_norm": 3.159651279449463, + "kl": 
3.3855438232421875e-05, + "learning_rate": 3.4285714285714286e-07, + "loss": -1.9348, + "reward": 0.2083333358168602, + "reward_std": 0.26603008806705475, + "rewards/accuracy_reward": 0.2083333358168602, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 2773.166748046875, + "epoch": 0.0074285714285714285, + "grad_norm": 0.7064540386199951, + "kl": 1.9043684005737305e-05, + "learning_rate": 3.7142857142857145e-07, + "loss": -1.4774, + "reward": 0.25, + "reward_std": 0.20412413775920868, + "rewards/accuracy_reward": 0.25, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 2341.0833740234375, + "epoch": 0.008, + "grad_norm": 0.7222815155982971, + "kl": 4.0650367736816406e-05, + "learning_rate": 4e-07, + "loss": -0.7442, + "reward": 0.4583333432674408, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.4583333432674408, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 2472.791748046875, + "epoch": 0.008571428571428572, + "grad_norm": 0.6996123790740967, + "kl": 2.658367156982422e-05, + "learning_rate": 4.285714285714285e-07, + "loss": -0.7392, + "reward": 0.0416666679084301, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 3173.7501220703125, + "epoch": 0.009142857142857144, + "grad_norm": 1.0215026140213013, + "kl": 4.792213439941406e-05, + "learning_rate": 4.571428571428571e-07, + "loss": -0.9988, + "reward": 0.125, + "reward_std": 0.1369306445121765, + "rewards/accuracy_reward": 0.125, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 3044.916748046875, + "epoch": 0.009714285714285713, + "grad_norm": 0.6144117116928101, + "kl": 4.780292510986328e-05, + "learning_rate": 4.857142857142857e-07, + "loss": -0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 3092.2083740234375, + "epoch": 
0.010285714285714285, + "grad_norm": 1.0112086534500122, + "kl": 4.398822784423828e-05, + "learning_rate": 5.142857142857142e-07, + "loss": -1.7343, + "reward": 0.1666666716337204, + "reward_std": 0.23899272084236145, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 2726.7501220703125, + "epoch": 0.010857142857142857, + "grad_norm": 0.9500206112861633, + "kl": 5.0067901611328125e-05, + "learning_rate": 5.428571428571428e-07, + "loss": -0.9414, + "reward": 0.1666666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 1488.6250915527344, + "epoch": 0.011428571428571429, + "grad_norm": 1.102255940437317, + "kl": 2.771615982055664e-05, + "learning_rate": 5.714285714285714e-07, + "loss": -1.685, + "reward": 0.375, + "reward_std": 0.23116150498390198, + "rewards/accuracy_reward": 0.375, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 1883.8334350585938, + "epoch": 0.012, + "grad_norm": 2.169161558151245, + "kl": 4.553794860839844e-05, + "learning_rate": 6e-07, + "loss": -2.3845, + "reward": 0.1666666716337204, + "reward_std": 0.3332235962152481, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 2890.416748046875, + "epoch": 0.012571428571428572, + "grad_norm": 3.423088550567627, + "kl": 3.8743019104003906e-05, + "learning_rate": 6.285714285714286e-07, + "loss": -3.6207, + "reward": 0.5000000298023224, + "reward_std": 0.49719157814979553, + "rewards/accuracy_reward": 0.5000000298023224, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 2703.9583740234375, + "epoch": 0.013142857142857144, + "grad_norm": 1.8604422807693481, + "kl": 5.328655242919922e-05, + "learning_rate": 6.571428571428571e-07, + "loss": -1.8681, + "reward": 0.1666666716337204, + "reward_std": 0.25819888710975647, + 
"rewards/accuracy_reward": 0.1666666716337204, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 2797.5, + "epoch": 0.013714285714285714, + "grad_norm": 1.5852028131484985, + "kl": 6.42538070678711e-05, + "learning_rate": 6.857142857142857e-07, + "loss": -1.6552, + "reward": 0.291666679084301, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.291666679084301, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 1578.5417175292969, + "epoch": 0.014285714285714285, + "grad_norm": 1.9514633417129517, + "kl": 5.4836273193359375e-05, + "learning_rate": 7.142857142857143e-07, + "loss": -2.4821, + "reward": 0.375, + "reward_std": 0.3410547971725464, + "rewards/accuracy_reward": 0.375, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 2654.666748046875, + "epoch": 0.014857142857142857, + "grad_norm": 1.7990684509277344, + "kl": 7.772445678710938e-05, + "learning_rate": 7.428571428571429e-07, + "loss": -1.8798, + "reward": 0.1666666716337204, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 3421.9583740234375, + "epoch": 0.015428571428571429, + "grad_norm": 0.19747313857078552, + "kl": 9.441375732421875e-05, + "learning_rate": 7.714285714285714e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 2514.3751220703125, + "epoch": 0.016, + "grad_norm": 1.2808500528335571, + "kl": 8.916854858398438e-05, + "learning_rate": 8e-07, + "loss": -1.7419, + "reward": 0.1666666679084301, + "reward_std": 0.23899271339178085, + "rewards/accuracy_reward": 0.1666666679084301, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 2835.9583740234375, + "epoch": 0.01657142857142857, + "grad_norm": 2.1129915714263916, + "kl": 9.989738464355469e-05, + "learning_rate": 8.285714285714285e-07, + "loss": -1.9361, + 
"reward": 0.2083333432674408, + "reward_std": 0.26603007316589355, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 3564.416748046875, + "epoch": 0.017142857142857144, + "grad_norm": 1.2208162546157837, + "kl": 0.0001220703125, + "learning_rate": 8.57142857142857e-07, + "loss": -1.8741, + "reward": 0.1666666716337204, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 2305.2916870117188, + "epoch": 0.017714285714285714, + "grad_norm": 1.7113858461380005, + "kl": 0.00023293495178222656, + "learning_rate": 8.857142857142856e-07, + "loss": -1.6837, + "reward": 0.2083333395421505, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.2083333395421505, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 2269.6666870117188, + "epoch": 0.018285714285714287, + "grad_norm": 2.3853516578674316, + "kl": 0.00031375885009765625, + "learning_rate": 9.142857142857142e-07, + "loss": -1.8832, + "reward": 0.3333333432674408, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.3333333432674408, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 3584.0, + "epoch": 0.018857142857142857, + "grad_norm": 0.5030116438865662, + "kl": 0.00035381317138671875, + "learning_rate": 9.428571428571428e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 2739.6250610351562, + "epoch": 0.019428571428571427, + "grad_norm": 1.6508129835128784, + "kl": 0.00036144256591796875, + "learning_rate": 9.714285714285715e-07, + "loss": -1.8729, + "reward": 0.1666666716337204, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 3151.291748046875, + "epoch": 0.02, + "grad_norm": 
0.6652354001998901, + "kl": 0.0004940032958984375, + "learning_rate": 1e-06, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 2798.291748046875, + "epoch": 0.02057142857142857, + "grad_norm": 1.781134843826294, + "kl": 0.0005702972412109375, + "learning_rate": 9.999776201301742e-07, + "loss": -1.6796, + "reward": 0.1250000037252903, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 3469.8751220703125, + "epoch": 0.021142857142857144, + "grad_norm": 1.4302971363067627, + "kl": 0.000423431396484375, + "learning_rate": 9.999104827467354e-07, + "loss": -1.7207, + "reward": 0.1666666679084301, + "reward_std": 0.23899271339178085, + "rewards/accuracy_reward": 0.1666666679084301, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 3070.75, + "epoch": 0.021714285714285714, + "grad_norm": 1.0685802698135376, + "kl": 0.0008945465087890625, + "learning_rate": 9.997985945275765e-07, + "loss": -1.7185, + "reward": 0.1666666679084301, + "reward_std": 0.23899271339178085, + "rewards/accuracy_reward": 0.1666666679084301, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 1992.791748046875, + "epoch": 0.022285714285714287, + "grad_norm": 1.3023139238357544, + "kl": 0.00078582763671875, + "learning_rate": 9.996419666017806e-07, + "loss": -0.9405, + "reward": 0.4166666865348816, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.4166666865348816, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 3451.2083740234375, + "epoch": 0.022857142857142857, + "grad_norm": 0.3098903298377991, + "kl": 0.000820159912109375, + "learning_rate": 9.994406145485149e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 
2855.041748046875, + "epoch": 0.023428571428571427, + "grad_norm": 0.9272823333740234, + "kl": 0.001934051513671875, + "learning_rate": 9.991945583954808e-07, + "loss": -0.7224, + "reward": 0.2916666865348816, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.2916666865348816, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 2162.3334350585938, + "epoch": 0.024, + "grad_norm": 1.3229894638061523, + "kl": 0.00215911865234375, + "learning_rate": 9.989038226169207e-07, + "loss": -1.6771, + "reward": 0.1250000037252903, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 3419.291748046875, + "epoch": 0.02457142857142857, + "grad_norm": 1.2003979682922363, + "kl": 0.00185394287109375, + "learning_rate": 9.985684361311858e-07, + "loss": -1.4837, + "reward": 0.0833333358168602, + "reward_std": 0.20412414520978928, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 2956.9583740234375, + "epoch": 0.025142857142857144, + "grad_norm": 1.1588594913482666, + "kl": 0.00295257568359375, + "learning_rate": 9.981884322978574e-07, + "loss": -1.8687, + "reward": 0.1666666716337204, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 3275.166748046875, + "epoch": 0.025714285714285714, + "grad_norm": 0.5378715991973877, + "kl": 0.00176239013671875, + "learning_rate": 9.977638489144307e-07, + "loss": -0.731, + "reward": 0.0416666679084301, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 2521.20849609375, + "epoch": 0.026285714285714287, + "grad_norm": 1.5500916242599487, + "kl": 0.00409698486328125, + "learning_rate": 9.972947282125533e-07, + "loss": -2.6114, + "reward": 0.2916666716337204, 
+ "reward_std": 0.3602609634399414, + "rewards/accuracy_reward": 0.2916666716337204, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 2619.5416870117188, + "epoch": 0.026857142857142857, + "grad_norm": 0.6222373247146606, + "kl": 0.00214385986328125, + "learning_rate": 9.967811168538266e-07, + "loss": -0.7438, + "reward": 0.4583333432674408, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.4583333432674408, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 1774.7500610351562, + "epoch": 0.027428571428571427, + "grad_norm": 2.3108158111572266, + "kl": 0.00504302978515625, + "learning_rate": 9.962230659251635e-07, + "loss": -2.8747, + "reward": 0.375, + "reward_std": 0.395129531621933, + "rewards/accuracy_reward": 0.375, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 2151.1666870117188, + "epoch": 0.028, + "grad_norm": 1.405102252960205, + "kl": 0.00498199462890625, + "learning_rate": 9.956206309337066e-07, + "loss": -1.7383, + "reward": 0.1666666716337204, + "reward_std": 0.23899272084236145, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 2918.75, + "epoch": 0.02857142857142857, + "grad_norm": 0.5462047457695007, + "kl": 0.0054779052734375, + "learning_rate": 9.949738718013078e-07, + "loss": -0.7418, + "reward": 0.2916666865348816, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2916666865348816, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 2755.2084350585938, + "epoch": 0.029142857142857144, + "grad_norm": 1.4510318040847778, + "kl": 0.003520965576171875, + "learning_rate": 9.94282852858568e-07, + "loss": -1.4865, + "reward": 0.2500000111758709, + "reward_std": 0.20412414520978928, + "rewards/accuracy_reward": 0.2500000111758709, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 3465.5, + "epoch": 0.029714285714285714, + "grad_norm": 0.6955327987670898, + "kl": 
0.003753662109375, + "learning_rate": 9.935476428384382e-07, + "loss": -0.9347, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 3004.291748046875, + "epoch": 0.030285714285714287, + "grad_norm": 1.4090065956115723, + "kl": 0.0052032470703125, + "learning_rate": 9.927683148693833e-07, + "loss": -1.6757, + "reward": 0.1250000037252903, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 3041.541748046875, + "epoch": 0.030857142857142857, + "grad_norm": 0.2869970500469208, + "kl": 0.0038604736328125, + "learning_rate": 9.919449464681074e-07, + "loss": 0.0006, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 3156.916748046875, + "epoch": 0.03142857142857143, + "grad_norm": 0.863756537437439, + "kl": 0.0051727294921875, + "learning_rate": 9.910776195318447e-07, + "loss": -1.6736, + "reward": 0.2083333432674408, + "reward_std": 0.23116151988506317, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 2776.041748046875, + "epoch": 0.032, + "grad_norm": 0.8829509615898132, + "kl": 0.00750732421875, + "learning_rate": 9.901664203302124e-07, + "loss": -0.7267, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 2872.666748046875, + "epoch": 0.03257142857142857, + "grad_norm": 1.085909366607666, + "kl": 0.0074005126953125, + "learning_rate": 9.89211439496631e-07, + "loss": -2.6113, + "reward": 0.2083333395421505, + "reward_std": 0.3602609634399414, + "rewards/accuracy_reward": 0.2083333395421505, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 
3217.875, + "epoch": 0.03314285714285714, + "grad_norm": 0.8099249005317688, + "kl": 0.0081024169921875, + "learning_rate": 9.882127720193078e-07, + "loss": -1.7368, + "reward": 0.1666666679084301, + "reward_std": 0.23899271339178085, + "rewards/accuracy_reward": 0.1666666679084301, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.6250610351562, + "epoch": 0.03371428571428572, + "grad_norm": 0.9201177358627319, + "kl": 0.01129150390625, + "learning_rate": 9.871705172317903e-07, + "loss": -1.4821, + "reward": 0.0833333358168602, + "reward_std": 0.20412413775920868, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 3198.166748046875, + "epoch": 0.03428571428571429, + "grad_norm": 0.7448273301124573, + "kl": 0.009307861328125, + "learning_rate": 9.86084778803085e-07, + "loss": -1.7306, + "reward": 0.1666666716337204, + "reward_std": 0.23899272084236145, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 2209.9583740234375, + "epoch": 0.03485714285714286, + "grad_norm": 2.11501145362854, + "kl": 0.006805419921875, + "learning_rate": 9.849556647273461e-07, + "loss": -3.6049, + "reward": 0.4166666716337204, + "reward_std": 0.4971916079521179, + "rewards/accuracy_reward": 0.4166666716337204, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 2689.3333740234375, + "epoch": 0.03542857142857143, + "grad_norm": 0.3600199520587921, + "kl": 0.00649261474609375, + "learning_rate": 9.83783287313134e-07, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 3274.6251220703125, + "epoch": 0.036, + "grad_norm": 0.7515490055084229, + "kl": 0.009490966796875, + "learning_rate": 9.825677631722435e-07, + "loss": -1.4768, + "reward": 0.0833333358168602, + "reward_std": 0.20412415266036987, + "rewards/accuracy_reward": 
0.0833333358168602, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 3410.041748046875, + "epoch": 0.036571428571428574, + "grad_norm": 0.2419460266828537, + "kl": 0.006988525390625, + "learning_rate": 9.81309213208106e-07, + "loss": 0.0011, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 3360.58349609375, + "epoch": 0.037142857142857144, + "grad_norm": 1.0176174640655518, + "kl": 0.0145263671875, + "learning_rate": 9.800077626037633e-07, + "loss": -0.9358, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 1987.7084350585938, + "epoch": 0.037714285714285714, + "grad_norm": 1.1186784505844116, + "kl": 0.008026123046875, + "learning_rate": 9.786635408094157e-07, + "loss": -1.7376, + "reward": 0.1666666716337204, + "reward_std": 0.23899272084236145, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 1821.4166870117188, + "epoch": 0.038285714285714284, + "grad_norm": 2.2777140140533447, + "kl": 0.013885498046875, + "learning_rate": 9.772766815295467e-07, + "loss": -2.2118, + "reward": 0.1250000037252903, + "reward_std": 0.3061862140893936, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 3485.416748046875, + "epoch": 0.038857142857142854, + "grad_norm": 1.044973373413086, + "kl": 0.008209228515625, + "learning_rate": 9.758473227096238e-07, + "loss": -2.2271, + "reward": 0.1250000037252903, + "reward_std": 0.306186206638813, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 2774.791748046875, + "epoch": 0.03942857142857143, + "grad_norm": 0.6572657823562622, + "kl": 0.00588226318359375, + "learning_rate": 9.743756065223773e-07, + "loss": 
-0.9946, + "reward": 0.125, + "reward_std": 0.1369306445121765, + "rewards/accuracy_reward": 0.125, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 2589.7083740234375, + "epoch": 0.04, + "grad_norm": 1.0476188659667969, + "kl": 0.011474609375, + "learning_rate": 9.728616793536587e-07, + "loss": -0.9965, + "reward": 0.125, + "reward_std": 0.1369306445121765, + "rewards/accuracy_reward": 0.125, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 1670.9166870117188, + "epoch": 0.04057142857142857, + "grad_norm": 0.6323314309120178, + "kl": 0.0037994384765625, + "learning_rate": 9.713056917878816e-07, + "loss": 0.0008, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 2806.416748046875, + "epoch": 0.04114285714285714, + "grad_norm": 0.5533509850502014, + "kl": 0.01007080078125, + "learning_rate": 9.697077985930424e-07, + "loss": -0.7427, + "reward": 0.2083333432674408, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 1994.25, + "epoch": 0.04171428571428572, + "grad_norm": 1.0187442302703857, + "kl": 0.0106658935546875, + "learning_rate": 9.68068158705326e-07, + "loss": -0.997, + "reward": 0.125, + "reward_std": 0.1369306445121765, + "rewards/accuracy_reward": 0.125, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 2693.125, + "epoch": 0.04228571428571429, + "grad_norm": 0.834760844707489, + "kl": 0.00494384765625, + "learning_rate": 9.663869352132985e-07, + "loss": -1.6737, + "reward": 0.2083333395421505, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.2083333395421505, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 2046.6666870117188, + "epoch": 0.04285714285714286, + "grad_norm": 0.8847272992134094, + "kl": 0.01031494140625, + "learning_rate": 9.646642953416834e-07, + "loss": -0.7426, + 
"reward": 0.2083333432674408, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 3069.7083740234375, + "epoch": 0.04342857142857143, + "grad_norm": 0.8105300664901733, + "kl": 0.0060882568359375, + "learning_rate": 9.6290041043473e-07, + "loss": -1.67, + "reward": 0.1250000037252903, + "reward_std": 0.23116151988506317, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 2794.1251220703125, + "epoch": 0.044, + "grad_norm": 0.8459436893463135, + "kl": 0.011474609375, + "learning_rate": 9.610954559391704e-07, + "loss": -0.9395, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 2131.7083740234375, + "epoch": 0.044571428571428574, + "grad_norm": 0.9749023914337158, + "kl": 0.00823974609375, + "learning_rate": 9.592496113867668e-07, + "loss": -1.9386, + "reward": 0.5416666716337204, + "reward_std": 0.26603008806705475, + "rewards/accuracy_reward": 0.5416666716337204, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 2000.5833740234375, + "epoch": 0.045142857142857144, + "grad_norm": 0.6731491684913635, + "kl": 0.0103607177734375, + "learning_rate": 9.573630603764566e-07, + "loss": -0.7392, + "reward": 0.2916666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2916666679084301, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 3062.0833740234375, + "epoch": 0.045714285714285714, + "grad_norm": 0.9580934643745422, + "kl": 0.0061187744140625, + "learning_rate": 9.554359905560885e-07, + "loss": -0.988, + "reward": 0.125, + "reward_std": 0.1369306445121765, + "rewards/accuracy_reward": 0.125, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 1195.0000610351562, + "epoch": 0.046285714285714284, + 
"grad_norm": 1.316493034362793, + "kl": 0.011627197265625, + "learning_rate": 9.534685936037593e-07, + "loss": -1.9331, + "reward": 0.2916666716337204, + "reward_std": 0.26603008806705475, + "rewards/accuracy_reward": 0.2916666716337204, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 2899.75, + "epoch": 0.046857142857142854, + "grad_norm": 0.8076637387275696, + "kl": 0.01312255859375, + "learning_rate": 9.514610652087475e-07, + "loss": -1.6822, + "reward": 0.2083333395421505, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.2083333395421505, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 869.2917175292969, + "epoch": 0.04742857142857143, + "grad_norm": 1.114304542541504, + "kl": 0.0088043212890625, + "learning_rate": 9.494136050520494e-07, + "loss": -0.7427, + "reward": 0.4583333432674408, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.4583333432674408, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 1685.3334350585938, + "epoch": 0.048, + "grad_norm": 1.8916584253311157, + "kl": 0.0077972412109375, + "learning_rate": 9.473264167865171e-07, + "loss": -2.8129, + "reward": 0.2500000074505806, + "reward_std": 0.3872983306646347, + "rewards/accuracy_reward": 0.2500000074505806, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 1882.3750610351562, + "epoch": 0.04857142857142857, + "grad_norm": 0.999433696269989, + "kl": 0.023345947265625, + "learning_rate": 9.451997080166028e-07, + "loss": -0.7365, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 2046.541748046875, + "epoch": 0.04914285714285714, + "grad_norm": 0.9735854268074036, + "kl": 0.0104827880859375, + "learning_rate": 9.430336902777083e-07, + "loss": -2.4278, + "reward": 0.5000000223517418, + "reward_std": 0.3332235813140869, + "rewards/accuracy_reward": 
0.5000000223517418, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 2605.2501220703125, + "epoch": 0.04971428571428571, + "grad_norm": 1.1373125314712524, + "kl": 0.019866943359375, + "learning_rate": 9.40828579015145e-07, + "loss": -1.481, + "reward": 0.2500000111758709, + "reward_std": 0.20412413775920868, + "rewards/accuracy_reward": 0.2500000111758709, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 2085.6251220703125, + "epoch": 0.05028571428571429, + "grad_norm": 1.1215746402740479, + "kl": 0.01055908203125, + "learning_rate": 9.385845935627039e-07, + "loss": -1.6782, + "reward": 0.1250000037252903, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 1928.4166870117188, + "epoch": 0.05085714285714286, + "grad_norm": 1.2779592275619507, + "kl": 0.022705078125, + "learning_rate": 9.363019571208397e-07, + "loss": -1.722, + "reward": 0.1666666679084301, + "reward_std": 0.23899271339178085, + "rewards/accuracy_reward": 0.1666666679084301, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 1391.0833740234375, + "epoch": 0.05142857142857143, + "grad_norm": 1.28791081905365, + "kl": 0.011474609375, + "learning_rate": 9.3398089673447e-07, + "loss": -2.3681, + "reward": 0.4166666865348816, + "reward_std": 0.3332235962152481, + "rewards/accuracy_reward": 0.4166666865348816, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 2919.33349609375, + "epoch": 0.052, + "grad_norm": 0.5967366695404053, + "kl": 0.0152587890625, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0027, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 2806.8333740234375, + "epoch": 0.052571428571428575, + "grad_norm": 0.5494978427886963, + "kl": 0.01953125, + "learning_rate": 9.292244313943176e-07, + "loss": 0.0038, + "reward": 0.25, + 
"reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 1886.0833435058594, + "epoch": 0.053142857142857144, + "grad_norm": 0.9120233654975891, + "kl": 0.014862060546875, + "learning_rate": 9.267894995475355e-07, + "loss": -0.7398, + "reward": 0.2916666865348816, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.2916666865348816, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 2132.8333740234375, + "epoch": 0.053714285714285714, + "grad_norm": 0.9982142448425293, + "kl": 0.01995849609375, + "learning_rate": 9.24317089923191e-07, + "loss": -1.682, + "reward": 0.291666679084301, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.291666679084301, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 1412.0000305175781, + "epoch": 0.054285714285714284, + "grad_norm": 1.1398898363113403, + "kl": 0.009185791015625, + "learning_rate": 9.218074484421977e-07, + "loss": -2.6066, + "reward": 0.2083333358168602, + "reward_std": 0.3602609485387802, + "rewards/accuracy_reward": 0.2083333358168602, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 3135.33349609375, + "epoch": 0.054857142857142854, + "grad_norm": 0.46777933835983276, + "kl": 0.013153076171875, + "learning_rate": 9.192608247287761e-07, + "loss": -0.7412, + "reward": 0.0416666679084301, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 2038.7083435058594, + "epoch": 0.05542857142857143, + "grad_norm": 1.239867925643921, + "kl": 0.014312744140625, + "learning_rate": 9.166774720856253e-07, + "loss": -0.938, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 1562.3333740234375, + "epoch": 0.056, + "grad_norm": 1.027034044265747, + "kl": 
0.0085601806640625, + "learning_rate": 9.140576474687263e-07, + "loss": -1.9377, + "reward": 0.5416666716337204, + "reward_std": 0.26603008806705475, + "rewards/accuracy_reward": 0.5416666716337204, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 2020.0001220703125, + "epoch": 0.05657142857142857, + "grad_norm": 1.366735577583313, + "kl": 0.0066680908203125, + "learning_rate": 9.114016114617857e-07, + "loss": -1.8751, + "reward": 0.5000000149011612, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.5000000149011612, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 1672.7500610351562, + "epoch": 0.05714285714285714, + "grad_norm": 1.17022705078125, + "kl": 0.00823974609375, + "learning_rate": 9.08709628250315e-07, + "loss": -2.4106, + "reward": 0.4166666865348816, + "reward_std": 0.3332235738635063, + "rewards/accuracy_reward": 0.4166666865348816, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 2906.58349609375, + "epoch": 0.05771428571428571, + "grad_norm": 0.629345715045929, + "kl": 0.009490966796875, + "learning_rate": 9.059819655953535e-07, + "loss": -1.6843, + "reward": 0.291666679084301, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.291666679084301, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 1907.1666870117188, + "epoch": 0.05828571428571429, + "grad_norm": 0.8361666202545166, + "kl": 0.01873779296875, + "learning_rate": 9.03218894806835e-07, + "loss": -1.4741, + "reward": 0.2500000111758709, + "reward_std": 0.20412414520978928, + "rewards/accuracy_reward": 0.2500000111758709, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 2738.041748046875, + "epoch": 0.05885714285714286, + "grad_norm": 0.5930432081222534, + "kl": 0.015869140625, + "learning_rate": 9.004206907166023e-07, + "loss": -0.9389, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 103 + }, 
+ { + "clip_ratio": 0.0, + "completion_length": 3003.791748046875, + "epoch": 0.05942857142857143, + "grad_norm": 0.7205336093902588, + "kl": 0.01336669921875, + "learning_rate": 8.975876316510698e-07, + "loss": -1.8796, + "reward": 0.2500000074505806, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.2500000074505806, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 2454.041748046875, + "epoch": 0.06, + "grad_norm": 0.5209367275238037, + "kl": 0.01300048828125, + "learning_rate": 8.9471999940354e-07, + "loss": -0.9357, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 2740.2500610351562, + "epoch": 0.060571428571428575, + "grad_norm": 1.4679958820343018, + "kl": 0.011962890625, + "learning_rate": 8.918180792061751e-07, + "loss": -1.6703, + "reward": 0.2083333432674408, + "reward_std": 0.23116151988506317, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 2093.916748046875, + "epoch": 0.061142857142857145, + "grad_norm": 0.57196444272995, + "kl": 0.0166015625, + "learning_rate": 8.88882159701625e-07, + "loss": -0.9368, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 2134.0833740234375, + "epoch": 0.061714285714285715, + "grad_norm": 0.393916517496109, + "kl": 0.010894775390625, + "learning_rate": 8.859125329143175e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 2911.541748046875, + "epoch": 0.062285714285714285, + "grad_norm": 0.3331953287124634, + "kl": 0.013275146484375, + "learning_rate": 8.829094942214127e-07, + "loss": -0.738, + "reward": 0.0416666679084301, + "reward_std": 
0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 2564.5000610351562, + "epoch": 0.06285714285714286, + "grad_norm": 0.29394540190696716, + "kl": 0.014984130859375, + "learning_rate": 8.798733423234219e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 2804.0, + "epoch": 0.06342857142857143, + "grad_norm": 0.7025728225708008, + "kl": 0.017059326171875, + "learning_rate": 8.768043792144968e-07, + "loss": 0.0027, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 2215.0416870117188, + "epoch": 0.064, + "grad_norm": 0.9984528422355652, + "kl": 0.01275634765625, + "learning_rate": 8.737029101523929e-07, + "loss": -1.7326, + "reward": 0.1666666716337204, + "reward_std": 0.23899272084236145, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 2483.2083740234375, + "epoch": 0.06457142857142857, + "grad_norm": 0.7270219326019287, + "kl": 0.010986328125, + "learning_rate": 8.705692436281051e-07, + "loss": -1.8786, + "reward": 0.1666666716337204, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 3233.20849609375, + "epoch": 0.06514285714285714, + "grad_norm": 0.5366232395172119, + "kl": 0.016143798828125, + "learning_rate": 8.674036913351838e-07, + "loss": -0.9371, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 2328.7501220703125, + "epoch": 0.06571428571428571, + "grad_norm": 0.6814904808998108, + "kl": 0.012298583984375, + "learning_rate": 8.642065681387327e-07, + "loss": -0.9397, + 
"reward": 0.1666666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 2270.916748046875, + "epoch": 0.06628571428571428, + "grad_norm": 0.7077198624610901, + "kl": 0.0084075927734375, + "learning_rate": 8.609781920440891e-07, + "loss": -1.4859, + "reward": 0.0833333358168602, + "reward_std": 0.20412413775920868, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 2786.1251220703125, + "epoch": 0.06685714285714285, + "grad_norm": 0.7300746440887451, + "kl": 0.0172119140625, + "learning_rate": 8.57718884165194e-07, + "loss": -1.6803, + "reward": 0.375, + "reward_std": 0.23116150498390198, + "rewards/accuracy_reward": 0.375, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 1789.4583740234375, + "epoch": 0.06742857142857143, + "grad_norm": 0.5591477751731873, + "kl": 0.01336669921875, + "learning_rate": 8.544289686926524e-07, + "loss": -0.9944, + "reward": 0.375, + "reward_std": 0.1369306445121765, + "rewards/accuracy_reward": 0.375, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 2085.916748046875, + "epoch": 0.068, + "grad_norm": 0.9207446575164795, + "kl": 0.01312255859375, + "learning_rate": 8.511087728614862e-07, + "loss": -0.7421, + "reward": 0.4583333432674408, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.4583333432674408, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 2560.9583740234375, + "epoch": 0.06857142857142857, + "grad_norm": 0.5242665410041809, + "kl": 0.012298583984375, + "learning_rate": 8.477586269185867e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 2857.3333740234375, + "epoch": 0.06914285714285714, + "grad_norm": 0.8910159468650818, + "kl": 0.015045166015625, + "learning_rate": 
8.443788640898654e-07, + "loss": -0.9389, + "reward": 0.1666666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 1371.8750610351562, + "epoch": 0.06971428571428571, + "grad_norm": 1.075481653213501, + "kl": 0.010223388671875, + "learning_rate": 8.409698205471098e-07, + "loss": -1.9938, + "reward": 0.5, + "reward_std": 0.273861289024353, + "rewards/accuracy_reward": 0.5, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 2519.2500610351562, + "epoch": 0.07028571428571428, + "grad_norm": 1.0758017301559448, + "kl": 0.01434326171875, + "learning_rate": 8.37531835374545e-07, + "loss": -2.2018, + "reward": 0.291666679084301, + "reward_std": 0.3061862215399742, + "rewards/accuracy_reward": 0.291666679084301, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 2478.70849609375, + "epoch": 0.07085714285714285, + "grad_norm": 0.8420854210853577, + "kl": 0.017333984375, + "learning_rate": 8.340652505351075e-07, + "loss": -1.6825, + "reward": 0.3750000149011612, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.3750000149011612, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 1974.3333740234375, + "epoch": 0.07142857142857142, + "grad_norm": 0.8553494811058044, + "kl": 0.016143798828125, + "learning_rate": 8.305704108364301e-07, + "loss": -1.9371, + "reward": 0.2083333358168602, + "reward_std": 0.26603008806705475, + "rewards/accuracy_reward": 0.2083333358168602, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 1352.0000305175781, + "epoch": 0.072, + "grad_norm": 0.7503184080123901, + "kl": 0.0130615234375, + "learning_rate": 8.270476638965461e-07, + "loss": -0.7278, + "reward": 0.2916666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2916666679084301, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 2711.20849609375, + "epoch": 
0.07257142857142856, + "grad_norm": 0.6893806457519531, + "kl": 0.0093841552734375, + "learning_rate": 8.234973601093135e-07, + "loss": -1.7343, + "reward": 0.4166666716337204, + "reward_std": 0.23899272084236145, + "rewards/accuracy_reward": 0.4166666716337204, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 2195.4583740234375, + "epoch": 0.07314285714285715, + "grad_norm": 0.5396758317947388, + "kl": 0.013031005859375, + "learning_rate": 8.199198526095611e-07, + "loss": -0.9382, + "reward": 0.3333333432674408, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.3333333432674408, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 1736.2916870117188, + "epoch": 0.07371428571428572, + "grad_norm": 1.1960101127624512, + "kl": 0.026214599609375, + "learning_rate": 8.163154972379655e-07, + "loss": -1.7324, + "reward": 0.1666666716337204, + "reward_std": 0.23899272084236145, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 1648.625, + "epoch": 0.07428571428571429, + "grad_norm": 0.7403463125228882, + "kl": 0.009918212890625, + "learning_rate": 8.126846525056555e-07, + "loss": -1.4736, + "reward": 0.3333333544433117, + "reward_std": 0.20412413775920868, + "rewards/accuracy_reward": 0.3333333544433117, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 2801.8333740234375, + "epoch": 0.07485714285714286, + "grad_norm": 0.7958399057388306, + "kl": 0.0125732421875, + "learning_rate": 8.090276795585531e-07, + "loss": -0.7389, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 2148.2084350585938, + "epoch": 0.07542857142857143, + "grad_norm": 0.6350630521774292, + "kl": 0.0111846923828125, + "learning_rate": 8.053449421414518e-07, + "loss": -1.6738, + "reward": 0.1250000037252903, + "reward_std": 
0.23116151988506317, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 2469.25, + "epoch": 0.076, + "grad_norm": 0.8013907670974731, + "kl": 0.014984130859375, + "learning_rate": 8.01636806561836e-07, + "loss": -1.4861, + "reward": 0.4166666865348816, + "reward_std": 0.20412415266036987, + "rewards/accuracy_reward": 0.4166666865348816, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 3052.5, + "epoch": 0.07657142857142857, + "grad_norm": 0.9900679588317871, + "kl": 0.020294189453125, + "learning_rate": 7.979036416534461e-07, + "loss": -1.8765, + "reward": 0.2500000074505806, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.2500000074505806, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 2824.541748046875, + "epoch": 0.07714285714285714, + "grad_norm": 0.5742546319961548, + "kl": 0.01898193359375, + "learning_rate": 7.941458187395917e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 1178.8750305175781, + "epoch": 0.07771428571428571, + "grad_norm": 1.0653953552246094, + "kl": 0.01934814453125, + "learning_rate": 7.903637115962179e-07, + "loss": -0.9936, + "reward": 0.125, + "reward_std": 0.1369306445121765, + "rewards/accuracy_reward": 0.125, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 2287.8333740234375, + "epoch": 0.07828571428571429, + "grad_norm": 0.7256926894187927, + "kl": 0.01116943359375, + "learning_rate": 7.86557696414727e-07, + "loss": -1.7341, + "reward": 0.1666666716337204, + "reward_std": 0.23899272084236145, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 2149.666748046875, + "epoch": 0.07885714285714286, + "grad_norm": 0.48477721214294434, + "kl": 0.0179443359375, + "learning_rate": 7.827281517645606e-07, + "loss": 0.003, + 
"reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 2431.5833740234375, + "epoch": 0.07942857142857143, + "grad_norm": 0.7151182293891907, + "kl": 0.01629638671875, + "learning_rate": 7.788754585555441e-07, + "loss": -0.7297, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 1989.9166870117188, + "epoch": 0.08, + "grad_norm": 0.5723420977592468, + "kl": 0.0123291015625, + "learning_rate": 7.75e-07, + "loss": -0.9392, + "reward": 0.4166666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.4166666716337204, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 1466.0834350585938, + "epoch": 0.08057142857142857, + "grad_norm": 0.8384633660316467, + "kl": 0.012237548828125, + "learning_rate": 7.7110216157463e-07, + "loss": -0.9396, + "reward": 0.1666666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 1722.1251220703125, + "epoch": 0.08114285714285714, + "grad_norm": 1.3027905225753784, + "kl": 0.0177001953125, + "learning_rate": 7.671823309821749e-07, + "loss": -0.9354, + "reward": 0.3333333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.3333333358168602, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 2374.5834350585938, + "epoch": 0.08171428571428571, + "grad_norm": 0.8892642259597778, + "kl": 0.02044677734375, + "learning_rate": 7.632408981128493e-07, + "loss": -1.924, + "reward": 0.4583333432674408, + "reward_std": 0.26603008806705475, + "rewards/accuracy_reward": 0.4583333432674408, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 1527.0833740234375, + "epoch": 0.08228571428571428, + "grad_norm": 1.809125542640686, + "kl": 
0.0185546875, + "learning_rate": 7.592782550055628e-07, + "loss": -2.7263, + "reward": 0.2916666716337204, + "reward_std": 0.37592336535453796, + "rewards/accuracy_reward": 0.2916666716337204, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 886.1666870117188, + "epoch": 0.08285714285714285, + "grad_norm": 1.7466280460357666, + "kl": 0.01373291015625, + "learning_rate": 7.552947958089233e-07, + "loss": -2.7183, + "reward": 0.2916666679084301, + "reward_std": 0.37592335790395737, + "rewards/accuracy_reward": 0.2916666679084301, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 1453.666748046875, + "epoch": 0.08342857142857144, + "grad_norm": 1.4468218088150024, + "kl": 0.013519287109375, + "learning_rate": 7.512909167420346e-07, + "loss": -2.4829, + "reward": 0.375, + "reward_std": 0.3410547822713852, + "rewards/accuracy_reward": 0.375, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 1603.791748046875, + "epoch": 0.084, + "grad_norm": 1.1846656799316406, + "kl": 0.01263427734375, + "learning_rate": 7.472670160550848e-07, + "loss": -2.6207, + "reward": 0.3750000149011612, + "reward_std": 0.3602609485387802, + "rewards/accuracy_reward": 0.3750000149011612, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 1206.4583740234375, + "epoch": 0.08457142857142858, + "grad_norm": 0.9129040837287903, + "kl": 0.01019287109375, + "learning_rate": 7.432234939897342e-07, + "loss": -1.6655, + "reward": 0.1250000037252903, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 1288.4583740234375, + "epoch": 0.08514285714285715, + "grad_norm": 0.93757164478302, + "kl": 0.01715087890625, + "learning_rate": 7.391607527393044e-07, + "loss": -0.9368, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 149 + }, + { + "clip_ratio": 0.0, + 
"completion_length": 1977.041748046875, + "epoch": 0.08571428571428572, + "grad_norm": 1.4448423385620117, + "kl": 0.017486572265625, + "learning_rate": 7.350791964087752e-07, + "loss": -1.6729, + "reward": 0.2083333395421505, + "reward_std": 0.23116151988506317, + "rewards/accuracy_reward": 0.2083333395421505, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 1958.7917175292969, + "epoch": 0.08628571428571429, + "grad_norm": 2.432656764984131, + "kl": 0.032012939453125, + "learning_rate": 7.309792309745878e-07, + "loss": -0.7388, + "reward": 0.4583333432674408, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.4583333432674408, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 2387.1251220703125, + "epoch": 0.08685714285714285, + "grad_norm": 0.5419829487800598, + "kl": 0.016754150390625, + "learning_rate": 7.268612642442656e-07, + "loss": 0.0024, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 2126.3333740234375, + "epoch": 0.08742857142857142, + "grad_norm": 0.5498960018157959, + "kl": 0.01190185546875, + "learning_rate": 7.227257058158502e-07, + "loss": -0.9965, + "reward": 0.375, + "reward_std": 0.1369306445121765, + "rewards/accuracy_reward": 0.375, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 2022.7501220703125, + "epoch": 0.088, + "grad_norm": 0.857801616191864, + "kl": 0.011138916015625, + "learning_rate": 7.185729670371604e-07, + "loss": -1.6669, + "reward": 0.4583333432674408, + "reward_std": 0.23116150498390198, + "rewards/accuracy_reward": 0.4583333432674408, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 3400.95849609375, + "epoch": 0.08857142857142856, + "grad_norm": 0.3610871732234955, + "kl": 0.0118408203125, + "learning_rate": 7.144034609648778e-07, + "loss": -0.7363, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 
0.0416666679084301, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 1649.166748046875, + "epoch": 0.08914285714285715, + "grad_norm": 1.4170763492584229, + "kl": 0.012237548828125, + "learning_rate": 7.102176023234605e-07, + "loss": -2.4233, + "reward": 0.4166666716337204, + "reward_std": 0.3332235962152481, + "rewards/accuracy_reward": 0.4166666716337204, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 2151.5001220703125, + "epoch": 0.08971428571428572, + "grad_norm": 0.8594135046005249, + "kl": 0.01202392578125, + "learning_rate": 7.060158074638932e-07, + "loss": -0.7423, + "reward": 0.2083333432674408, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 1745.2083740234375, + "epoch": 0.09028571428571429, + "grad_norm": 0.9574673771858215, + "kl": 0.01275634765625, + "learning_rate": 7.017984943222735e-07, + "loss": -2.6742, + "reward": 0.25, + "reward_std": 0.3680921494960785, + "rewards/accuracy_reward": 0.25, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 3289.6251220703125, + "epoch": 0.09085714285714286, + "grad_norm": 0.5987799763679504, + "kl": 0.013763427734375, + "learning_rate": 6.97566082378242e-07, + "loss": -1.4697, + "reward": 0.0833333358168602, + "reward_std": 0.20412414520978928, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 3266.5833740234375, + "epoch": 0.09142857142857143, + "grad_norm": 0.5092036724090576, + "kl": 0.012542724609375, + "learning_rate": 6.93318992613258e-07, + "loss": -0.9381, + "reward": 0.1666666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 1581.166748046875, + "epoch": 0.092, + "grad_norm": 0.6672082543373108, + "kl": 0.01666259765625, + "learning_rate": 6.890576474687263e-07, + 
"loss": -0.7416, + "reward": 0.2083333432674408, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 2034.916748046875, + "epoch": 0.09257142857142857, + "grad_norm": 1.0506476163864136, + "kl": 0.0162353515625, + "learning_rate": 6.847824708039786e-07, + "loss": -1.9362, + "reward": 0.2916666716337204, + "reward_std": 0.26603008806705475, + "rewards/accuracy_reward": 0.2916666716337204, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 2280.166748046875, + "epoch": 0.09314285714285714, + "grad_norm": 0.7479243278503418, + "kl": 0.019195556640625, + "learning_rate": 6.804938878541138e-07, + "loss": -0.9382, + "reward": 0.1666666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 2798.8751220703125, + "epoch": 0.09371428571428571, + "grad_norm": 0.7925019264221191, + "kl": 0.016387939453125, + "learning_rate": 6.761923251877012e-07, + "loss": -1.9367, + "reward": 0.2083333358168602, + "reward_std": 0.26603008806705475, + "rewards/accuracy_reward": 0.2083333358168602, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 3475.291748046875, + "epoch": 0.09428571428571429, + "grad_norm": 0.44603925943374634, + "kl": 0.01324462890625, + "learning_rate": 6.718782106643523e-07, + "loss": -0.7406, + "reward": 0.0416666679084301, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 3584.0, + "epoch": 0.09485714285714286, + "grad_norm": 0.43778079748153687, + "kl": 0.014923095703125, + "learning_rate": 6.675519733921623e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 1407.8333740234375, + "epoch": 0.09542857142857143, + 
"grad_norm": 1.5409038066864014, + "kl": 0.01873779296875, + "learning_rate": 6.632140436850289e-07, + "loss": -2.9278, + "reward": 0.5833333432674408, + "reward_std": 0.40296071767807007, + "rewards/accuracy_reward": 0.5833333432674408, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 2116.8751220703125, + "epoch": 0.096, + "grad_norm": 1.3273327350616455, + "kl": 0.0230712890625, + "learning_rate": 6.588648530198504e-07, + "loss": -2.4069, + "reward": 0.2500000111758709, + "reward_std": 0.3332235813140869, + "rewards/accuracy_reward": 0.2500000111758709, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 1039.9166870117188, + "epoch": 0.09657142857142857, + "grad_norm": 1.1365933418273926, + "kl": 0.015655517578125, + "learning_rate": 6.545048339936091e-07, + "loss": -2.4803, + "reward": 0.375, + "reward_std": 0.3410547971725464, + "rewards/accuracy_reward": 0.375, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 2980.3333740234375, + "epoch": 0.09714285714285714, + "grad_norm": 0.33093100786209106, + "kl": 0.01751708984375, + "learning_rate": 6.501344202803414e-07, + "loss": -0.7351, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 2490.08349609375, + "epoch": 0.09771428571428571, + "grad_norm": 0.4821685254573822, + "kl": 0.011688232421875, + "learning_rate": 6.45754046588003e-07, + "loss": -0.9394, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 2099.2083740234375, + "epoch": 0.09828571428571428, + "grad_norm": 0.5119174718856812, + "kl": 0.01446533203125, + "learning_rate": 6.413641486152292e-07, + "loss": -0.7392, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 
172 + }, + { + "clip_ratio": 0.0, + "completion_length": 1332.7917175292969, + "epoch": 0.09885714285714285, + "grad_norm": 1.1657651662826538, + "kl": 0.017425537109375, + "learning_rate": 6.36965163007999e-07, + "loss": -2.2233, + "reward": 0.1250000037252903, + "reward_std": 0.3061862215399742, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 2518.7083740234375, + "epoch": 0.09942857142857142, + "grad_norm": 0.38400009274482727, + "kl": 0.02191162109375, + "learning_rate": 6.32557527316202e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 1589.9166870117188, + "epoch": 0.1, + "grad_norm": 1.2936434745788574, + "kl": 0.027374267578125, + "learning_rate": 6.281416799501187e-07, + "loss": -1.7382, + "reward": 0.5833333432674408, + "reward_std": 0.23899272084236145, + "rewards/accuracy_reward": 0.5833333432674408, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 1653.9167175292969, + "epoch": 0.10057142857142858, + "grad_norm": 1.3641895055770874, + "kl": 0.02020263671875, + "learning_rate": 6.23718060136812e-07, + "loss": -2.6149, + "reward": 0.291666679084301, + "reward_std": 0.3602609634399414, + "rewards/accuracy_reward": 0.291666679084301, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 2173.9583435058594, + "epoch": 0.10114285714285715, + "grad_norm": 0.7888267636299133, + "kl": 0.016754150390625, + "learning_rate": 6.1928710787644e-07, + "loss": -0.9389, + "reward": 0.4166666865348816, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.4166666865348816, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 3312.25, + "epoch": 0.10171428571428572, + "grad_norm": 0.9001780152320862, + "kl": 0.01397705078125, + "learning_rate": 6.14849263898491e-07, + "loss": -1.876, + "reward": 0.1666666716337204, + "reward_std": 
0.25819888710975647, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 2765.416748046875, + "epoch": 0.10228571428571429, + "grad_norm": 0.5644757747650146, + "kl": 0.015350341796875, + "learning_rate": 6.10404969617945e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 2159.08349609375, + "epoch": 0.10285714285714286, + "grad_norm": 1.2660763263702393, + "kl": 0.0147705078125, + "learning_rate": 6.059546670913684e-07, + "loss": -2.7191, + "reward": 0.2916666716337204, + "reward_std": 0.37592336535453796, + "rewards/accuracy_reward": 0.2916666716337204, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 2710.625, + "epoch": 0.10342857142857143, + "grad_norm": 0.7003759145736694, + "kl": 0.02130126953125, + "learning_rate": 6.014987989729444e-07, + "loss": -1.8783, + "reward": 0.2500000074505806, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.2500000074505806, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 2344.625, + "epoch": 0.104, + "grad_norm": 0.6448298096656799, + "kl": 0.0184326171875, + "learning_rate": 5.97037808470444e-07, + "loss": -1.4745, + "reward": 0.0833333358168602, + "reward_std": 0.20412415266036987, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 2700.1251220703125, + "epoch": 0.10457142857142857, + "grad_norm": 0.40715157985687256, + "kl": 0.01531982421875, + "learning_rate": 5.925721393011417e-07, + "loss": -0.7211, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 1704.2501220703125, + "epoch": 0.10514285714285715, + "grad_norm": 1.0119861364364624, + "kl": 0.02203369140625, + "learning_rate": 
5.881022356476804e-07, + "loss": -0.7407, + "reward": 0.4583333432674408, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.4583333432674408, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 1675.4166870117188, + "epoch": 0.10571428571428572, + "grad_norm": 2.0721378326416016, + "kl": 0.025970458984375, + "learning_rate": 5.836285421138909e-07, + "loss": -1.4489, + "reward": 0.0833333358168602, + "reward_std": 0.20412414520978928, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 2593.291748046875, + "epoch": 0.10628571428571429, + "grad_norm": 0.6480510234832764, + "kl": 0.0172119140625, + "learning_rate": 5.791515036805684e-07, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 2512.4583740234375, + "epoch": 0.10685714285714286, + "grad_norm": 0.7385961413383484, + "kl": 0.017333984375, + "learning_rate": 5.74671565661212e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 2527.8333740234375, + "epoch": 0.10742857142857143, + "grad_norm": 1.0105268955230713, + "kl": 0.018035888671875, + "learning_rate": 5.701891736577317e-07, + "loss": -2.7295, + "reward": 0.4583333432674408, + "reward_std": 0.37592336535453796, + "rewards/accuracy_reward": 0.4583333432674408, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 3513.25, + "epoch": 0.108, + "grad_norm": 0.6996456384658813, + "kl": 0.01776123046875, + "learning_rate": 5.657047735161255e-07, + "loss": -1.6768, + "reward": 0.2083333432674408, + "reward_std": 0.23116150498390198, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 2915.916748046875, + "epoch": 0.10857142857142857, + "grad_norm": 0.6052663922309875, + "kl": 
0.012054443359375, + "learning_rate": 5.612188112821328e-07, + "loss": -0.7361, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 2475.5001220703125, + "epoch": 0.10914285714285714, + "grad_norm": 0.3194851279258728, + "kl": 0.0223388671875, + "learning_rate": 5.567317331568686e-07, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 2002.1250610351562, + "epoch": 0.10971428571428571, + "grad_norm": 45.269439697265625, + "kl": 0.251220703125, + "learning_rate": 5.522439854524411e-07, + "loss": -2.4264, + "reward": 0.3750000149011612, + "reward_std": 0.3410547897219658, + "rewards/accuracy_reward": 0.3750000149011612, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 2904.2083740234375, + "epoch": 0.11028571428571429, + "grad_norm": 0.3201312720775604, + "kl": 0.01861572265625, + "learning_rate": 5.477560145475589e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 1847.7500610351562, + "epoch": 0.11085714285714286, + "grad_norm": 0.9246636629104614, + "kl": 0.0289306640625, + "learning_rate": 5.432682668431314e-07, + "loss": -0.917, + "reward": 0.3333333432674408, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.3333333432674408, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 2270.6666870117188, + "epoch": 0.11142857142857143, + "grad_norm": 0.4476500153541565, + "kl": 0.017730712890625, + "learning_rate": 5.387811887178673e-07, + "loss": 0.0029, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 2440.8333740234375, + "epoch": 0.112, + "grad_norm": 0.5211483836174011, + "kl": 
0.011688232421875, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0022, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 2394.9583740234375, + "epoch": 0.11257142857142857, + "grad_norm": 0.533424973487854, + "kl": 0.02685546875, + "learning_rate": 5.298108263422685e-07, + "loss": 0.0048, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 2006.25, + "epoch": 0.11314285714285714, + "grad_norm": 1.4151604175567627, + "kl": 0.019134521484375, + "learning_rate": 5.25328434338788e-07, + "loss": -2.6747, + "reward": 0.3333333432674408, + "reward_std": 0.3680921494960785, + "rewards/accuracy_reward": 0.3333333432674408, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 2482.041748046875, + "epoch": 0.11371428571428571, + "grad_norm": 1.1144918203353882, + "kl": 0.019287109375, + "learning_rate": 5.208484963194316e-07, + "loss": -1.4696, + "reward": 0.2500000111758709, + "reward_std": 0.20412414520978928, + "rewards/accuracy_reward": 0.2500000111758709, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 2661.916748046875, + "epoch": 0.11428571428571428, + "grad_norm": 0.9477536082267761, + "kl": 0.023193359375, + "learning_rate": 5.163714578861091e-07, + "loss": -0.9854, + "reward": 0.125, + "reward_std": 0.1369306445121765, + "rewards/accuracy_reward": 0.125, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 2519.8333740234375, + "epoch": 0.11485714285714285, + "grad_norm": 1.073439359664917, + "kl": 0.016082763671875, + "learning_rate": 5.118977643523196e-07, + "loss": -0.7384, + "reward": 0.0416666679084301, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 2459.916748046875, + "epoch": 0.11542857142857142, + "grad_norm": 
1.0262060165405273, + "kl": 0.014678955078125, + "learning_rate": 5.074278606988584e-07, + "loss": -1.6796, + "reward": 0.2083333395421505, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.2083333395421505, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 3012.3751220703125, + "epoch": 0.116, + "grad_norm": 0.8095118403434753, + "kl": 0.01739501953125, + "learning_rate": 5.02962191529556e-07, + "loss": -2.6648, + "reward": 0.25, + "reward_std": 0.3680921643972397, + "rewards/accuracy_reward": 0.25, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 2506.3751220703125, + "epoch": 0.11657142857142858, + "grad_norm": 0.5008082985877991, + "kl": 0.013458251953125, + "learning_rate": 4.985012010270557e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 2347.7916870117188, + "epoch": 0.11714285714285715, + "grad_norm": 0.6901289820671082, + "kl": 0.01611328125, + "learning_rate": 4.940453329086318e-07, + "loss": -1.4828, + "reward": 0.0833333358168602, + "reward_std": 0.20412415266036987, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 2616.666748046875, + "epoch": 0.11771428571428572, + "grad_norm": 0.631709098815918, + "kl": 0.02020263671875, + "learning_rate": 4.895950303820552e-07, + "loss": -0.9945, + "reward": 0.125, + "reward_std": 0.1369306445121765, + "rewards/accuracy_reward": 0.125, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 2875.8333740234375, + "epoch": 0.11828571428571429, + "grad_norm": 0.7697901129722595, + "kl": 0.014129638671875, + "learning_rate": 4.85150736101509e-07, + "loss": -0.9966, + "reward": 0.125, + "reward_std": 0.1369306445121765, + "rewards/accuracy_reward": 0.125, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 2224.3751220703125, + "epoch": 0.11885714285714286, + 
"grad_norm": 0.7268201112747192, + "kl": 0.011077880859375, + "learning_rate": 4.807128921235598e-07, + "loss": -1.4794, + "reward": 0.5, + "reward_std": 0.20412413775920868, + "rewards/accuracy_reward": 0.5, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 2420.7083740234375, + "epoch": 0.11942857142857143, + "grad_norm": 0.5704336166381836, + "kl": 0.009765625, + "learning_rate": 4.76281939863188e-07, + "loss": 0.0016, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 1993.3333740234375, + "epoch": 0.12, + "grad_norm": 0.507115364074707, + "kl": 0.017822265625, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 1624.125, + "epoch": 0.12057142857142857, + "grad_norm": 1.2795109748840332, + "kl": 0.009735107421875, + "learning_rate": 4.67442472683798e-07, + "loss": -2.6766, + "reward": 0.3333333432674408, + "reward_std": 0.3680921494960785, + "rewards/accuracy_reward": 0.3333333432674408, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 3554.3333740234375, + "epoch": 0.12114285714285715, + "grad_norm": 0.6074011325836182, + "kl": 0.021728515625, + "learning_rate": 4.6303483699200105e-07, + "loss": -0.9938, + "reward": 0.125, + "reward_std": 0.1369306445121765, + "rewards/accuracy_reward": 0.125, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 1680.666748046875, + "epoch": 0.12171428571428572, + "grad_norm": 1.1823277473449707, + "kl": 0.014251708984375, + "learning_rate": 4.5863585138477077e-07, + "loss": -1.7337, + "reward": 0.1666666716337204, + "reward_std": 0.23899272084236145, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 2227.375, + "epoch": 0.12228571428571429, + "grad_norm": 0.7161904573440552, + 
"kl": 0.01544189453125, + "learning_rate": 4.542459534119971e-07, + "loss": -0.7411, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 2544.7501220703125, + "epoch": 0.12285714285714286, + "grad_norm": 0.9380499720573425, + "kl": 0.01983642578125, + "learning_rate": 4.4986557971965856e-07, + "loss": -0.9942, + "reward": 0.125, + "reward_std": 0.1369306445121765, + "rewards/accuracy_reward": 0.125, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 1188.8333740234375, + "epoch": 0.12342857142857143, + "grad_norm": 1.2638156414031982, + "kl": 0.015777587890625, + "learning_rate": 4.454951660063909e-07, + "loss": -1.6826, + "reward": 0.7916666865348816, + "reward_std": 0.23116150498390198, + "rewards/accuracy_reward": 0.7916666865348816, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 3525.25, + "epoch": 0.124, + "grad_norm": 0.28554829955101013, + "kl": 0.01068115234375, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 2003.75, + "epoch": 0.12457142857142857, + "grad_norm": 0.7368922233581543, + "kl": 0.02069091796875, + "learning_rate": 4.367859563149712e-07, + "loss": -0.741, + "reward": 0.2083333432674408, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 884.8333435058594, + "epoch": 0.12514285714285714, + "grad_norm": 0.8822182416915894, + "kl": 0.01611328125, + "learning_rate": 4.3244802660783775e-07, + "loss": -0.9384, + "reward": 0.4166666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.4166666716337204, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 2650.75, + "epoch": 0.12571428571428572, + 
"grad_norm": 0.6846665143966675, + "kl": 0.014801025390625, + "learning_rate": 4.281217893356478e-07, + "loss": -1.6798, + "reward": 0.3750000111758709, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.3750000111758709, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 1293.7916870117188, + "epoch": 0.12628571428571428, + "grad_norm": 0.869556188583374, + "kl": 0.016845703125, + "learning_rate": 4.2380767481229884e-07, + "loss": -0.7405, + "reward": 0.2916666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2916666679084301, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 1300.9583740234375, + "epoch": 0.12685714285714286, + "grad_norm": 0.937000572681427, + "kl": 0.01593017578125, + "learning_rate": 4.195061121458862e-07, + "loss": -0.939, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 2665.791748046875, + "epoch": 0.12742857142857142, + "grad_norm": 0.794379472732544, + "kl": 0.01953125, + "learning_rate": 4.152175291960214e-07, + "loss": -0.9379, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 2896.08349609375, + "epoch": 0.128, + "grad_norm": 0.8766622543334961, + "kl": 0.0145263671875, + "learning_rate": 4.1094235253127374e-07, + "loss": -2.6164, + "reward": 0.2916666716337204, + "reward_std": 0.3602609485387802, + "rewards/accuracy_reward": 0.2916666716337204, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 2229.8750610351562, + "epoch": 0.12857142857142856, + "grad_norm": 1.1731126308441162, + "kl": 0.016143798828125, + "learning_rate": 4.0668100738674205e-07, + "loss": -2.4688, + "reward": 0.2083333395421505, + "reward_std": 0.3410547971725464, + "rewards/accuracy_reward": 
0.2083333395421505, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 1741.8750915527344, + "epoch": 0.12914285714285714, + "grad_norm": 0.7152168154716492, + "kl": 0.01202392578125, + "learning_rate": 4.0243391762175803e-07, + "loss": -0.9392, + "reward": 0.4166666865348816, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.4166666865348816, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 2109.291748046875, + "epoch": 0.12971428571428573, + "grad_norm": 0.4053463339805603, + "kl": 0.0142822265625, + "learning_rate": 3.982015056777265e-07, + "loss": -0.7372, + "reward": 0.2916666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2916666679084301, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.5833740234375, + "epoch": 0.13028571428571428, + "grad_norm": 1.3437594175338745, + "kl": 0.017852783203125, + "learning_rate": 3.939841925361067e-07, + "loss": -1.7393, + "reward": 0.4166666716337204, + "reward_std": 0.23899272084236145, + "rewards/accuracy_reward": 0.4166666716337204, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 2041.2916870117188, + "epoch": 0.13085714285714287, + "grad_norm": 0.44564294815063477, + "kl": 0.01416015625, + "learning_rate": 3.897823976765394e-07, + "loss": -0.7404, + "reward": 0.2916666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2916666679084301, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 2914.416748046875, + "epoch": 0.13142857142857142, + "grad_norm": 5.425497531890869, + "kl": 0.02838134765625, + "learning_rate": 3.855965390351222e-07, + "loss": -1.8737, + "reward": 0.1666666716337204, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 2574.375, + "epoch": 0.132, + "grad_norm": 1.1059547662734985, + "kl": 0.02020263671875, + "learning_rate": 
3.8142703296283953e-07, + "loss": -1.6797, + "reward": 0.2083333432674408, + "reward_std": 0.23116150498390198, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 2597.041748046875, + "epoch": 0.13257142857142856, + "grad_norm": 0.80033940076828, + "kl": 0.01983642578125, + "learning_rate": 3.772742941841499e-07, + "loss": -1.4814, + "reward": 0.2500000111758709, + "reward_std": 0.20412413775920868, + "rewards/accuracy_reward": 0.2500000111758709, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 3533.5, + "epoch": 0.13314285714285715, + "grad_norm": 0.4203540086746216, + "kl": 0.015869140625, + "learning_rate": 3.731387357557344e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 3225.6251220703125, + "epoch": 0.1337142857142857, + "grad_norm": 0.7065865397453308, + "kl": 0.017547607421875, + "learning_rate": 3.6902076902541214e-07, + "loss": -1.6826, + "reward": 0.2083333432674408, + "reward_std": 0.23116150498390198, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 1722.6250610351562, + "epoch": 0.13428571428571429, + "grad_norm": 0.5265027284622192, + "kl": 0.013519287109375, + "learning_rate": 3.649208035912249e-07, + "loss": -0.7374, + "reward": 0.2916666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2916666679084301, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 1470.5000610351562, + "epoch": 0.13485714285714287, + "grad_norm": 0.5969668030738831, + "kl": 0.01806640625, + "learning_rate": 3.608392472606956e-07, + "loss": 0.0026, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 1409.5000610351562, + "epoch": 0.13542857142857143, + "grad_norm": 1.4041366577148438, 
+ "kl": 0.0216064453125, + "learning_rate": 3.5677650601026585e-07, + "loss": -1.4828, + "reward": 0.0833333358168602, + "reward_std": 0.20412415266036987, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 2312.5001220703125, + "epoch": 0.136, + "grad_norm": 1.0515042543411255, + "kl": 0.0201416015625, + "learning_rate": 3.5273298394491515e-07, + "loss": -2.6223, + "reward": 0.291666679084301, + "reward_std": 0.3602609485387802, + "rewards/accuracy_reward": 0.291666679084301, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 2486.9166870117188, + "epoch": 0.13657142857142857, + "grad_norm": 0.8044987916946411, + "kl": 0.02252197265625, + "learning_rate": 3.4870908325796527e-07, + "loss": -2.2171, + "reward": 0.3750000223517418, + "reward_std": 0.306186206638813, + "rewards/accuracy_reward": 0.3750000223517418, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 1590.8751220703125, + "epoch": 0.13714285714285715, + "grad_norm": 0.6538607478141785, + "kl": 0.015533447265625, + "learning_rate": 3.4470520419107664e-07, + "loss": -0.9312, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 2145.8750610351562, + "epoch": 0.1377142857142857, + "grad_norm": 0.8519302010536194, + "kl": 0.02294921875, + "learning_rate": 3.407217449944373e-07, + "loss": -0.9377, + "reward": 0.4166666865348816, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.4166666865348816, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 3469.291748046875, + "epoch": 0.1382857142857143, + "grad_norm": 0.5576246380805969, + "kl": 0.016998291015625, + "learning_rate": 3.367591018871506e-07, + "loss": -0.9348, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 242 + }, + 
{ + "clip_ratio": 0.0, + "completion_length": 2839.541748046875, + "epoch": 0.13885714285714285, + "grad_norm": 0.41596850752830505, + "kl": 0.01690673828125, + "learning_rate": 3.3281766901782517e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 2468.5416870117188, + "epoch": 0.13942857142857143, + "grad_norm": 0.8380830883979797, + "kl": 0.01788330078125, + "learning_rate": 3.2889783842536987e-07, + "loss": -0.9381, + "reward": 0.1666666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 2339.041748046875, + "epoch": 0.14, + "grad_norm": 1.0777580738067627, + "kl": 0.01507568359375, + "learning_rate": 3.250000000000001e-07, + "loss": -2.5829, + "reward": 0.2083333358168602, + "reward_std": 0.3602609634399414, + "rewards/accuracy_reward": 0.2083333358168602, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 2758.166748046875, + "epoch": 0.14057142857142857, + "grad_norm": 0.8583000302314758, + "kl": 0.015411376953125, + "learning_rate": 3.211245414444559e-07, + "loss": -1.9945, + "reward": 0.25, + "reward_std": 0.273861289024353, + "rewards/accuracy_reward": 0.25, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 2797.291748046875, + "epoch": 0.14114285714285715, + "grad_norm": 1.0498836040496826, + "kl": 0.027252197265625, + "learning_rate": 3.172718482354393e-07, + "loss": -2.6177, + "reward": 0.458333358168602, + "reward_std": 0.3602609485387802, + "rewards/accuracy_reward": 0.458333358168602, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 1560.8750610351562, + "epoch": 0.1417142857142857, + "grad_norm": 0.2750161290168762, + "kl": 0.01739501953125, + "learning_rate": 3.1344230358527284e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + 
"step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 2254.375, + "epoch": 0.1422857142857143, + "grad_norm": 0.7589177489280701, + "kl": 0.02679443359375, + "learning_rate": 3.096362884037821e-07, + "loss": -0.7333, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 2698.6251220703125, + "epoch": 0.14285714285714285, + "grad_norm": 0.5587661266326904, + "kl": 0.015472412109375, + "learning_rate": 3.058541812604083e-07, + "loss": -0.7418, + "reward": 0.2083333432674408, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 1584.3750610351562, + "epoch": 0.14342857142857143, + "grad_norm": 0.8878784775733948, + "kl": 0.0150146484375, + "learning_rate": 3.020963583465539e-07, + "loss": -1.6822, + "reward": 0.3750000149011612, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.3750000149011612, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 1354.0833435058594, + "epoch": 0.144, + "grad_norm": 1.3565187454223633, + "kl": 0.0235595703125, + "learning_rate": 2.9836319343816397e-07, + "loss": -1.6732, + "reward": 0.125, + "reward_std": 0.23116151988506317, + "rewards/accuracy_reward": 0.125, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 2829.625, + "epoch": 0.14457142857142857, + "grad_norm": 0.8856632113456726, + "kl": 0.015411376953125, + "learning_rate": 2.946550578585483e-07, + "loss": -0.7394, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 2557.0, + "epoch": 0.14514285714285713, + "grad_norm": 0.5032978653907776, + "kl": 0.01812744140625, + "learning_rate": 2.9097232044144696e-07, + "loss": -1.724, + "reward": 0.1666666679084301, + 
"reward_std": 0.23899271339178085, + "rewards/accuracy_reward": 0.1666666679084301, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 2025.5001220703125, + "epoch": 0.1457142857142857, + "grad_norm": 0.6513389945030212, + "kl": 0.014007568359375, + "learning_rate": 2.8731534749434464e-07, + "loss": -0.9391, + "reward": 0.4166666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.4166666716337204, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 3475.541748046875, + "epoch": 0.1462857142857143, + "grad_norm": 0.5068712830543518, + "kl": 0.01947021484375, + "learning_rate": 2.836845027620346e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 1864.4584350585938, + "epoch": 0.14685714285714285, + "grad_norm": 11.749297142028809, + "kl": 0.2183837890625, + "learning_rate": 2.8008014739043884e-07, + "loss": -1.8304, + "reward": 0.1666666716337204, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 1771.0833740234375, + "epoch": 0.14742857142857144, + "grad_norm": 0.6075973510742188, + "kl": 0.009613037109375, + "learning_rate": 2.765026398906865e-07, + "loss": -1.6771, + "reward": 0.2083333395421505, + "reward_std": 0.23116151988506317, + "rewards/accuracy_reward": 0.2083333395421505, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 2659.1666870117188, + "epoch": 0.148, + "grad_norm": 0.9356407523155212, + "kl": 0.014862060546875, + "learning_rate": 2.729523361034538e-07, + "loss": -1.6647, + "reward": 0.125, + "reward_std": 0.23116150498390198, + "rewards/accuracy_reward": 0.125, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 2985.7083740234375, + "epoch": 0.14857142857142858, + "grad_norm": 0.3902454674243927, + "kl": 0.015960693359375, + "learning_rate": 
2.6942958916356994e-07, + "loss": -0.9388, + "reward": 0.1666666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 3379.5833740234375, + "epoch": 0.14914285714285713, + "grad_norm": 0.5188402533531189, + "kl": 0.02239990234375, + "learning_rate": 2.659347494648925e-07, + "loss": -0.7375, + "reward": 0.0416666679084301, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 2401.70849609375, + "epoch": 0.14971428571428572, + "grad_norm": 0.5534473657608032, + "kl": 0.014617919921875, + "learning_rate": 2.6246816462545496e-07, + "loss": -0.9393, + "reward": 0.1666666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 2452.0833740234375, + "epoch": 0.15028571428571427, + "grad_norm": 0.7840031981468201, + "kl": 0.01806640625, + "learning_rate": 2.5903017945289017e-07, + "loss": -0.9387, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 3485.625, + "epoch": 0.15085714285714286, + "grad_norm": 0.45268696546554565, + "kl": 0.013824462890625, + "learning_rate": 2.5562113591013457e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 3424.5001220703125, + "epoch": 0.15142857142857144, + "grad_norm": 0.4016251564025879, + "kl": 0.01983642578125, + "learning_rate": 2.5224137308141336e-07, + "loss": -0.7367, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 2272.375, + "epoch": 0.152, + 
"grad_norm": 0.6088007092475891, + "kl": 0.0252685546875, + "learning_rate": 2.488912271385139e-07, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 1409.5833740234375, + "epoch": 0.15257142857142858, + "grad_norm": 0.706413209438324, + "kl": 0.016357421875, + "learning_rate": 2.4557103130734763e-07, + "loss": -0.7323, + "reward": 0.0416666679084301, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 2088.666748046875, + "epoch": 0.15314285714285714, + "grad_norm": 2.1446173191070557, + "kl": 0.019317626953125, + "learning_rate": 2.4228111583480596e-07, + "loss": -1.7378, + "reward": 0.1666666716337204, + "reward_std": 0.23899272084236145, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 2598.20849609375, + "epoch": 0.15371428571428572, + "grad_norm": 0.18762169778347015, + "kl": 0.012420654296875, + "learning_rate": 2.390218079559109e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 1651.0833740234375, + "epoch": 0.15428571428571428, + "grad_norm": 0.6639404892921448, + "kl": 0.01995849609375, + "learning_rate": 2.3579343186126726e-07, + "loss": -0.7353, + "reward": 0.2916666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2916666679084301, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.2500610351562, + "epoch": 0.15485714285714286, + "grad_norm": 0.9474217891693115, + "kl": 0.0179443359375, + "learning_rate": 2.3259630866481605e-07, + "loss": -1.8744, + "reward": 0.1666666716337204, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 271 + }, + { + "clip_ratio": 0.0, + 
"completion_length": 2655.5834350585938, + "epoch": 0.15542857142857142, + "grad_norm": 0.628149688243866, + "kl": 0.017578125, + "learning_rate": 2.294307563718949e-07, + "loss": -0.7414, + "reward": 0.2083333432674408, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 2282.3333740234375, + "epoch": 0.156, + "grad_norm": 1.2667988538742065, + "kl": 0.0169677734375, + "learning_rate": 2.2629708984760706e-07, + "loss": -1.6717, + "reward": 0.1250000037252903, + "reward_std": 0.23116151988506317, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 2919.916748046875, + "epoch": 0.15657142857142858, + "grad_norm": 0.7472139596939087, + "kl": 0.01800537109375, + "learning_rate": 2.2319562078550318e-07, + "loss": -0.9381, + "reward": 0.1666666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 1767.5000610351562, + "epoch": 0.15714285714285714, + "grad_norm": 1.192237377166748, + "kl": 0.01751708984375, + "learning_rate": 2.2012665767657823e-07, + "loss": -0.7415, + "reward": 0.2083333432674408, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 1048.2917175292969, + "epoch": 0.15771428571428572, + "grad_norm": 1.2694878578186035, + "kl": 0.018829345703125, + "learning_rate": 2.1709050577858728e-07, + "loss": -1.8771, + "reward": 0.583333358168602, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.583333358168602, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 2410.416748046875, + "epoch": 0.15828571428571428, + "grad_norm": 1.6138927936553955, + "kl": 0.030670166015625, + "learning_rate": 2.1408746708568242e-07, + "loss": -1.93, + "reward": 
0.2083333432674408, + "reward_std": 0.26603007316589355, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 2286.125, + "epoch": 0.15885714285714286, + "grad_norm": 0.7141113877296448, + "kl": 0.014129638671875, + "learning_rate": 2.1111784029837509e-07, + "loss": -0.742, + "reward": 0.2083333432674408, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 1842.8750610351562, + "epoch": 0.15942857142857142, + "grad_norm": 0.6313254833221436, + "kl": 0.01605224609375, + "learning_rate": 2.081819207938249e-07, + "loss": -0.939, + "reward": 0.1666666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 2166.541748046875, + "epoch": 0.16, + "grad_norm": 0.8989630341529846, + "kl": 0.015899658203125, + "learning_rate": 2.0528000059645995e-07, + "loss": -0.9391, + "reward": 0.1666666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 1435.3750915527344, + "epoch": 0.16057142857142856, + "grad_norm": 0.4465429186820984, + "kl": 0.01531982421875, + "learning_rate": 2.0241236834893028e-07, + "loss": 0.0025, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 2514.5833740234375, + "epoch": 0.16114285714285714, + "grad_norm": 1.1687437295913696, + "kl": 0.02166748046875, + "learning_rate": 1.9957930928339772e-07, + "loss": -1.8717, + "reward": 0.1666666716337204, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 2165.3333740234375, + "epoch": 0.16171428571428573, + "grad_norm": 0.9040661454200745, + "kl": 
0.016571044921875, + "learning_rate": 1.96781105193165e-07, + "loss": -0.9278, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 2579.9583740234375, + "epoch": 0.16228571428571428, + "grad_norm": 0.7441158890724182, + "kl": 0.01458740234375, + "learning_rate": 1.9401803440464654e-07, + "loss": -1.6716, + "reward": 0.375, + "reward_std": 0.23116150498390198, + "rewards/accuracy_reward": 0.375, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 3037.75, + "epoch": 0.16285714285714287, + "grad_norm": 0.6573392152786255, + "kl": 0.015625, + "learning_rate": 1.9129037174968505e-07, + "loss": -0.9305, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 2087.666748046875, + "epoch": 0.16342857142857142, + "grad_norm": 0.9549260139465332, + "kl": 0.017333984375, + "learning_rate": 1.8859838853821435e-07, + "loss": -2.6746, + "reward": 0.2500000074505806, + "reward_std": 0.3680921643972397, + "rewards/accuracy_reward": 0.2500000074505806, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 1758.9166870117188, + "epoch": 0.164, + "grad_norm": 1.3493952751159668, + "kl": 0.0152587890625, + "learning_rate": 1.8594235253127372e-07, + "loss": -2.8729, + "reward": 0.291666679084301, + "reward_std": 0.3951295167207718, + "rewards/accuracy_reward": 0.291666679084301, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 2302.3751220703125, + "epoch": 0.16457142857142856, + "grad_norm": 0.5830644965171814, + "kl": 0.0174560546875, + "learning_rate": 1.8332252791437486e-07, + "loss": -0.7415, + "reward": 0.2083333432674408, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 
2104.8333740234375, + "epoch": 0.16514285714285715, + "grad_norm": 0.9054921269416809, + "kl": 0.0169677734375, + "learning_rate": 1.8073917527122385e-07, + "loss": -1.7252, + "reward": 0.1666666716337204, + "reward_std": 0.23899272084236145, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 1549.2500915527344, + "epoch": 0.1657142857142857, + "grad_norm": 1.4962562322616577, + "kl": 0.02374267578125, + "learning_rate": 1.7819255155780238e-07, + "loss": -0.9334, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 2643.3751220703125, + "epoch": 0.1662857142857143, + "grad_norm": 0.8332542777061462, + "kl": 0.020294189453125, + "learning_rate": 1.7568291007680907e-07, + "loss": -1.4634, + "reward": 0.0833333358168602, + "reward_std": 0.20412413775920868, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 2123.791748046875, + "epoch": 0.16685714285714287, + "grad_norm": 0.7176339030265808, + "kl": 0.0137939453125, + "learning_rate": 1.7321050045246455e-07, + "loss": -1.4564, + "reward": 0.0833333358168602, + "reward_std": 0.20412413775920868, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 1340.9167175292969, + "epoch": 0.16742857142857143, + "grad_norm": 1.2635085582733154, + "kl": 0.011871337890625, + "learning_rate": 1.7077556860568238e-07, + "loss": -2.4176, + "reward": 0.3333333544433117, + "reward_std": 0.3332235738635063, + "rewards/accuracy_reward": 0.3333333544433117, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 2578.8333740234375, + "epoch": 0.168, + "grad_norm": 0.5623243451118469, + "kl": 0.0145263671875, + "learning_rate": 1.6837835672960831e-07, + "loss": -0.7386, + "reward": 0.0416666679084301, + 
"reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 2309.2084350585938, + "epoch": 0.16857142857142857, + "grad_norm": 1.792176365852356, + "kl": 0.014251708984375, + "learning_rate": 1.6601910326552998e-07, + "loss": -1.6771, + "reward": 0.1250000037252903, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 2761.2083740234375, + "epoch": 0.16914285714285715, + "grad_norm": 0.5179618000984192, + "kl": 0.0174560546875, + "learning_rate": 1.6369804287916025e-07, + "loss": -0.74, + "reward": 0.0416666679084301, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 2832.916748046875, + "epoch": 0.1697142857142857, + "grad_norm": 1.062676191329956, + "kl": 0.0169677734375, + "learning_rate": 1.6141540643729612e-07, + "loss": -2.6165, + "reward": 0.2916666679084301, + "reward_std": 0.3602609559893608, + "rewards/accuracy_reward": 0.2916666679084301, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 2174.5, + "epoch": 0.1702857142857143, + "grad_norm": 1.0952427387237549, + "kl": 0.015716552734375, + "learning_rate": 1.5917142098485503e-07, + "loss": -1.4785, + "reward": 0.25, + "reward_std": 0.20412413775920868, + "rewards/accuracy_reward": 0.25, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 1648.291748046875, + "epoch": 0.17085714285714285, + "grad_norm": 0.9061567187309265, + "kl": 0.016937255859375, + "learning_rate": 1.5696630972229166e-07, + "loss": -1.6821, + "reward": 0.2083333395421505, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.2083333395421505, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 2321.9583740234375, + "epoch": 0.17142857142857143, + "grad_norm": 0.47703251242637634, + "kl": 
0.01666259765625, + "learning_rate": 1.548002919833971e-07, + "loss": -0.7289, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 2111.7501220703125, + "epoch": 0.172, + "grad_norm": 0.47726908326148987, + "kl": 0.01434326171875, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 1759.4166870117188, + "epoch": 0.17257142857142857, + "grad_norm": 0.6928355693817139, + "kl": 0.016265869140625, + "learning_rate": 1.5058639494795067e-07, + "loss": -0.9378, + "reward": 0.4166666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.4166666716337204, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 2077.5, + "epoch": 0.17314285714285715, + "grad_norm": 0.6859722137451172, + "kl": 0.0133056640625, + "learning_rate": 1.485389347912525e-07, + "loss": -1.4857, + "reward": 0.2500000111758709, + "reward_std": 0.20412413775920868, + "rewards/accuracy_reward": 0.2500000111758709, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 2916.2501220703125, + "epoch": 0.1737142857142857, + "grad_norm": 0.9551180601119995, + "kl": 0.01416015625, + "learning_rate": 1.4653140639624066e-07, + "loss": -1.8796, + "reward": 0.1666666716337204, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 2326.8750610351562, + "epoch": 0.1742857142857143, + "grad_norm": 0.44118639826774597, + "kl": 0.0216064453125, + "learning_rate": 1.4456400944391144e-07, + "loss": -0.7264, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 
1633.6666870117188, + "epoch": 0.17485714285714285, + "grad_norm": 0.9067421555519104, + "kl": 0.02105712890625, + "learning_rate": 1.4263693962354336e-07, + "loss": -1.4626, + "reward": 0.3333333544433117, + "reward_std": 0.20412413775920868, + "rewards/accuracy_reward": 0.3333333544433117, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 2906.0833740234375, + "epoch": 0.17542857142857143, + "grad_norm": 0.49431222677230835, + "kl": 0.011474609375, + "learning_rate": 1.4075038861323302e-07, + "loss": -0.7356, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 1410.7500610351562, + "epoch": 0.176, + "grad_norm": 0.8035596609115601, + "kl": 0.01629638671875, + "learning_rate": 1.3890454406082956e-07, + "loss": -0.9359, + "reward": 0.5833333432674408, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.5833333432674408, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 3014.25, + "epoch": 0.17657142857142857, + "grad_norm": 0.2768433392047882, + "kl": 0.01849365234375, + "learning_rate": 1.3709958956526974e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 3245.7083740234375, + "epoch": 0.17714285714285713, + "grad_norm": 0.7835389971733093, + "kl": 0.01654052734375, + "learning_rate": 1.353357046583165e-07, + "loss": -2.2183, + "reward": 0.1250000037252903, + "reward_std": 0.3061862140893936, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 2727.3333740234375, + "epoch": 0.1777142857142857, + "grad_norm": 0.48111966252326965, + "kl": 0.009368896484375, + "learning_rate": 1.3361306478670148e-07, + "loss": -0.9302, + "reward": 0.0833333358168602, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 
0.0833333358168602, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 1927.541748046875, + "epoch": 0.1782857142857143, + "grad_norm": 0.5695589184761047, + "kl": 0.02117919921875, + "learning_rate": 1.3193184129467384e-07, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 2037.0000610351562, + "epoch": 0.17885714285714285, + "grad_norm": 1.5638245344161987, + "kl": 0.017608642578125, + "learning_rate": 1.3029220140695756e-07, + "loss": -1.6713, + "reward": 0.125, + "reward_std": 0.23116151988506317, + "rewards/accuracy_reward": 0.125, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 2119.4584350585938, + "epoch": 0.17942857142857144, + "grad_norm": 0.8178718686103821, + "kl": 0.0159912109375, + "learning_rate": 1.2869430821211826e-07, + "loss": -1.6774, + "reward": 0.458333358168602, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.458333358168602, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 2671.666748046875, + "epoch": 0.18, + "grad_norm": 0.7374725341796875, + "kl": 0.012298583984375, + "learning_rate": 1.2713832064634125e-07, + "loss": -1.6682, + "reward": 0.1250000037252903, + "reward_std": 0.23116151988506317, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 1793.666748046875, + "epoch": 0.18057142857142858, + "grad_norm": 0.8555610775947571, + "kl": 0.0196533203125, + "learning_rate": 1.2562439347762275e-07, + "loss": -0.7343, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 2572.6250610351562, + "epoch": 0.18114285714285713, + "grad_norm": 0.7458478808403015, + "kl": 0.012847900390625, + "learning_rate": 1.2415267729037608e-07, + "loss": -1.6769, + "reward": 
0.1250000037252903, + "reward_std": 0.23116151988506317, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 1808.0000610351562, + "epoch": 0.18171428571428572, + "grad_norm": 1.242701530456543, + "kl": 0.017181396484375, + "learning_rate": 1.2272331847045313e-07, + "loss": -2.6198, + "reward": 0.3750000223517418, + "reward_std": 0.3602609485387802, + "rewards/accuracy_reward": 0.3750000223517418, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 2072.875, + "epoch": 0.18228571428571427, + "grad_norm": 0.43984729051589966, + "kl": 0.01568603515625, + "learning_rate": 1.2133645919058418e-07, + "loss": -0.939, + "reward": 0.4166666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.4166666716337204, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 2183.375, + "epoch": 0.18285714285714286, + "grad_norm": 0.7200397849082947, + "kl": 0.017120361328125, + "learning_rate": 1.1999223739623666e-07, + "loss": -1.6787, + "reward": 0.125, + "reward_std": 0.23116150498390198, + "rewards/accuracy_reward": 0.125, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 3046.916748046875, + "epoch": 0.18342857142857144, + "grad_norm": 0.5004404187202454, + "kl": 0.016571044921875, + "learning_rate": 1.1869078679189393e-07, + "loss": -0.7381, + "reward": 0.0416666679084301, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 1882.75, + "epoch": 0.184, + "grad_norm": 1.3577691316604614, + "kl": 0.0198974609375, + "learning_rate": 1.1743223682775649e-07, + "loss": -3.1577, + "reward": 0.2083333358168602, + "reward_std": 0.43528564274311066, + "rewards/accuracy_reward": 0.2083333358168602, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 2563.2083740234375, + "epoch": 0.18457142857142858, + "grad_norm": 1.3319201469421387, + "kl": 
0.023193359375, + "learning_rate": 1.1621671268686605e-07, + "loss": -3.1597, + "reward": 0.291666679084301, + "reward_std": 0.43528565764427185, + "rewards/accuracy_reward": 0.291666679084301, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 2025.666748046875, + "epoch": 0.18514285714285714, + "grad_norm": 0.5463172793388367, + "kl": 0.02020263671875, + "learning_rate": 1.1504433527265378e-07, + "loss": -1.4631, + "reward": 0.0833333358168602, + "reward_std": 0.20412413775920868, + "rewards/accuracy_reward": 0.0833333358168602, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 2767.6251220703125, + "epoch": 0.18571428571428572, + "grad_norm": 0.6291061639785767, + "kl": 0.0208740234375, + "learning_rate": 1.1391522119691496e-07, + "loss": -0.7406, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 1606.8750610351562, + "epoch": 0.18628571428571428, + "grad_norm": 1.8553162813186646, + "kl": 0.02337646484375, + "learning_rate": 1.1282948276820962e-07, + "loss": -1.9341, + "reward": 0.2916666865348816, + "reward_std": 0.26603007316589355, + "rewards/accuracy_reward": 0.2916666865348816, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 2260.25, + "epoch": 0.18685714285714286, + "grad_norm": 0.6469095945358276, + "kl": 0.014312744140625, + "learning_rate": 1.1178722798069215e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 2050.666748046875, + "epoch": 0.18742857142857142, + "grad_norm": 0.7297951579093933, + "kl": 0.017822265625, + "learning_rate": 1.10788560503369e-07, + "loss": -0.9389, + "reward": 0.1666666716337204, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.1666666716337204, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 
2868.33349609375, + "epoch": 0.188, + "grad_norm": 1.0545990467071533, + "kl": 0.01544189453125, + "learning_rate": 1.0983357966978745e-07, + "loss": -2.73, + "reward": 0.2916666679084301, + "reward_std": 0.37592335790395737, + "rewards/accuracy_reward": 0.2916666679084301, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 2244.7501220703125, + "epoch": 0.18857142857142858, + "grad_norm": 1.0005416870117188, + "kl": 0.017578125, + "learning_rate": 1.0892238046815527e-07, + "loss": -2.2275, + "reward": 0.291666679084301, + "reward_std": 0.3061862140893936, + "rewards/accuracy_reward": 0.291666679084301, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 1698.3333435058594, + "epoch": 0.18914285714285714, + "grad_norm": 0.24079620838165283, + "kl": 0.01593017578125, + "learning_rate": 1.0805505353189254e-07, + "loss": 0.0029, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 1608.1667175292969, + "epoch": 0.18971428571428572, + "grad_norm": 0.5765926837921143, + "kl": 0.013397216796875, + "learning_rate": 1.0723168513061665e-07, + "loss": -0.7406, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 2000.416748046875, + "epoch": 0.19028571428571428, + "grad_norm": 0.4461122751235962, + "kl": 0.01611328125, + "learning_rate": 1.0645235716156168e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completion_length": 2835.75, + "epoch": 0.19085714285714286, + "grad_norm": 0.9358536005020142, + "kl": 0.01904296875, + "learning_rate": 1.0571714714143197e-07, + "loss": -1.6761, + "reward": 0.2083333432674408, + "reward_std": 0.23116150498390198, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 334 + }, + { + "clip_ratio": 
0.0, + "completion_length": 1533.125, + "epoch": 0.19142857142857142, + "grad_norm": 0.58393394947052, + "kl": 0.0191650390625, + "learning_rate": 1.0502612819869216e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 1860.0833740234375, + "epoch": 0.192, + "grad_norm": 0.9539235234260559, + "kl": 0.0213623046875, + "learning_rate": 1.0437936906629334e-07, + "loss": -2.6119, + "reward": 0.2083333358168602, + "reward_std": 0.3602609485387802, + "rewards/accuracy_reward": 0.2083333358168602, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 2359.9584350585938, + "epoch": 0.19257142857142856, + "grad_norm": 0.591833233833313, + "kl": 0.020263671875, + "learning_rate": 1.0377693407483638e-07, + "loss": -0.9325, + "reward": 0.4166666865348816, + "reward_std": 0.12909944355487823, + "rewards/accuracy_reward": 0.4166666865348816, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 2456.5834350585938, + "epoch": 0.19314285714285714, + "grad_norm": 1.7795339822769165, + "kl": 0.01800537109375, + "learning_rate": 1.032188831461732e-07, + "loss": -3.5502, + "reward": 0.375, + "reward_std": 0.48936039209365845, + "rewards/accuracy_reward": 0.375, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 2746.6251220703125, + "epoch": 0.19371428571428573, + "grad_norm": 0.5191667675971985, + "kl": 0.0203857421875, + "learning_rate": 1.0270527178744664e-07, + "loss": -0.7381, + "reward": 0.0416666679084301, + "reward_std": 0.10206207633018494, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 1982.0000610351562, + "epoch": 0.19428571428571428, + "grad_norm": 0.5445987582206726, + "kl": 0.01641845703125, + "learning_rate": 1.0223615108556937e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 340 + }, + { + 
"clip_ratio": 0.0, + "completion_length": 1073.125, + "epoch": 0.19485714285714287, + "grad_norm": 1.0652012825012207, + "kl": 0.01904296875, + "learning_rate": 1.0181156770214242e-07, + "loss": -1.9294, + "reward": 0.2083333358168602, + "reward_std": 0.26603008806705475, + "rewards/accuracy_reward": 0.2083333358168602, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 1901.2917175292969, + "epoch": 0.19542857142857142, + "grad_norm": 1.262035846710205, + "kl": 0.016754150390625, + "learning_rate": 1.0143156386881408e-07, + "loss": -1.8793, + "reward": 0.2500000074505806, + "reward_std": 0.25819888710975647, + "rewards/accuracy_reward": 0.2500000074505806, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 1721.8750610351562, + "epoch": 0.196, + "grad_norm": 0.8769494295120239, + "kl": 0.02349853515625, + "learning_rate": 1.0109617738307911e-07, + "loss": -0.7405, + "reward": 0.2083333432674408, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.2083333432674408, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 2183.2084350585938, + "epoch": 0.19657142857142856, + "grad_norm": 2.0577425956726074, + "kl": 0.01995849609375, + "learning_rate": 1.0080544160451918e-07, + "loss": -2.7138, + "reward": 0.2916666679084301, + "reward_std": 0.37592336535453796, + "rewards/accuracy_reward": 0.2916666679084301, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 1418.0833435058594, + "epoch": 0.19714285714285715, + "grad_norm": 0.6003240942955017, + "kl": 0.017822265625, + "learning_rate": 1.0055938545148495e-07, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 2902.291748046875, + "epoch": 0.1977142857142857, + "grad_norm": 0.4958335757255554, + "kl": 0.01824951171875, + "learning_rate": 1.0035803339821934e-07, + "loss": -0.7393, + "reward": 0.0416666679084301, + "reward_std": 
0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 2797.2083740234375, + "epoch": 0.1982857142857143, + "grad_norm": 0.5800639390945435, + "kl": 0.012176513671875, + "learning_rate": 1.002014054724235e-07, + "loss": -1.6757, + "reward": 0.1250000037252903, + "reward_std": 0.23116151243448257, + "rewards/accuracy_reward": 0.1250000037252903, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 2299.666748046875, + "epoch": 0.19885714285714284, + "grad_norm": 0.43301284313201904, + "kl": 0.02374267578125, + "learning_rate": 1.0008951725326441e-07, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 3268.9583740234375, + "epoch": 0.19942857142857143, + "grad_norm": 0.4114702641963959, + "kl": 0.0225830078125, + "learning_rate": 1.0002237986982564e-07, + "loss": -0.7369, + "reward": 0.0416666679084301, + "reward_std": 0.10206206887960434, + "rewards/accuracy_reward": 0.0416666679084301, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completion_length": 1340.791748046875, + "epoch": 0.2, + "grad_norm": 0.5905561447143555, + "kl": 0.017578125, + "learning_rate": 1e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "step": 350 + }, + { + "epoch": 0.2, + "step": 350, + "total_flos": 0.0, + "train_loss": -1.2733016510141806, + "train_runtime": 18317.2078, + "train_samples_per_second": 0.459, + "train_steps_per_second": 0.019 + } + ], + "logging_steps": 1, + "max_steps": 350, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + 
"trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..78b880a --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05966da51319b81bd97dcc81fdc93f207482ad3ca17fbcb04291041e21c945e1 +size 8248