From a602a75c1147a954176b312cf3681b12d4a454f0 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Tue, 16 Jun 2026 17:50:18 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: kangdawei/MMR-Sigmoid-DAPO-7B Source: Original Platform --- .gitattributes | 37 + README.md | 70 + adapter/README.md | 209 + adapter/adapter_config.json | 46 + adapter/adapter_model.safetensors | 3 + adapter/chat_template.jinja | 1 + adapter/special_tokens_map.json | 23 + adapter/tokenizer.json | 3 + adapter/tokenizer_config.json | 194 + adapter/training_args.bin | 3 + adapter_config.json | 46 + adapter_model.safetensors | 3 + all_results.json | 8 + chat_template.jinja | 1 + config.json | 59 + generation_config.json | 9 + model-00001-of-00004.safetensors | 3 + model-00002-of-00004.safetensors | 3 + model-00003-of-00004.safetensors | 3 + model-00004-of-00004.safetensors | 3 + model.safetensors.index.json | 347 + special_tokens_map.json | 23 + tokenizer.json | 3 + tokenizer_config.json | 194 + train_results.json | 8 + trainer_state.json | 11543 ++++++++++++++++++++++++++++ training_args.bin | 3 + 27 files changed, 12848 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 adapter/README.md create mode 100644 adapter/adapter_config.json create mode 100644 adapter/adapter_model.safetensors create mode 100644 adapter/chat_template.jinja create mode 100644 adapter/special_tokens_map.json create mode 100644 adapter/tokenizer.json create mode 100644 adapter/tokenizer_config.json create mode 100644 adapter/training_args.bin create mode 100644 adapter_config.json create mode 100644 adapter_model.safetensors create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model-00001-of-00004.safetensors create mode 100644 model-00002-of-00004.safetensors create mode 100644 model-00003-of-00004.safetensors create mode 100644 model-00004-of-00004.safetensors create mode 100644 model.safetensors.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..b19d564 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,37 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +adapter/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..caa5b82 --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +datasets: knoveleng/open-rs +library_name: transformers +model_name: MMR-Sigmoid-DAPO-7B +tags: +- generated_from_trainer +- open-r1 +- dapo +- trl +licence: license +--- + +# Model Card for MMR-Sigmoid-DAPO-7B + +This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) on the [knoveleng/open-rs](https://huggingface.co/datasets/knoveleng/open-rs) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="kangdawei/MMR-Sigmoid-DAPO-7B", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + + + + +This model was trained with DAPO, a method introduced in [DAPO: An Open-Source LLM Reinforcement Learning System at Scale](https://huggingface.co/papers/2503.14476). + +### Framework versions + +- TRL: 0.16.0.dev0 +- Transformers: 4.57.1 +- Pytorch: 2.5.1 +- Datasets: 3.2.0 +- Tokenizers: 0.22.1 + +## Citations + +Cite DAPO as: + +```bibtex +@article{yu2025dapo, + title = {{DAPO: An Open-Source LLM Reinforcement Learning System at Scale}}, + author = {Qiying Yu and Zheng Zhang and others}, + year = 2025, + eprint = {arXiv:2503.14476}, +} + +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/adapter/README.md b/adapter/README.md new file mode 100644 index 0000000..6cad5a9 --- /dev/null +++ b/adapter/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +- dapo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/adapter/adapter_config.json b/adapter/adapter_config.json new file mode 100644 index 0000000..49c05ef --- /dev/null +++ b/adapter/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "o_proj", + "gate_proj", + "q_proj", + "up_proj", + "v_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter/adapter_model.safetensors b/adapter/adapter_model.safetensors new file mode 100644 index 0000000..4eb2693 --- /dev/null +++ b/adapter/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f73ce5bb2236b4cdce3e19a4b94c89f63a08ff478d69f9c59c55273bb37335ba +size 323014560 diff --git a/adapter/chat_template.jinja b/adapter/chat_template.jinja new file mode 100644 index 0000000..c2066bd --- /dev/null +++ b/adapter/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/adapter/special_tokens_map.json b/adapter/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/adapter/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/adapter/tokenizer.json b/adapter/tokenizer.json new file mode 100644 index 0000000..e7cd2c1 --- /dev/null +++ b/adapter/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a +size 11422959 diff --git a/adapter/tokenizer_config.json b/adapter/tokenizer_config.json new file mode 100644 index 0000000..d252dd4 --- /dev/null +++ b/adapter/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/adapter/training_args.bin b/adapter/training_args.bin new file mode 100644 index 0000000..6392141 --- /dev/null +++ b/adapter/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b630f1da3b8a9f5dc75e704d29f8dbf0464a2b9b7c42e6843ec187c68bc4ed7 +size 8824 diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000..49c05ef --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "o_proj", + "gate_proj", + "q_proj", + "up_proj", + "v_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000..ba3b119 --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60d95b10b6e140a9626a7058d5038528f2ff80148dc4569b881db56052046509 +size 40 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..a83392c --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 0.0046342451156378955, + "train_runtime": 100887.1622, + "train_samples": 7000, + "train_samples_per_second": 0.238, + "train_steps_per_second": 0.005 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..c2066bd --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..51a2fb1 --- /dev/null +++ b/config.json @@ -0,0 +1,59 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "4.57.1", + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 152064 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..3f29992 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.57.1" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..d6fa90b --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d059cbca571bc6e215c55a9b8f3464d9ff00a7215c40c328e6319b0d60ffe620 +size 4877660776 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..f229a5d --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c007eec268c04a2cffbaea53997baf185d11c93774160d62603b99d1f5144c9 +size 4932751008 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..efb5a82 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb555bea9309829e77802abee4f6203cb7aa14d459c372d58514d2ec3833aa36 +size 4330865200 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..ef3b83a --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7946740ab848b4a02904f05bff81211bf9600fc20bfad39bbcc8d1703a40ce1c +size 1089994880 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5b2b8b5 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,347 @@ +{ + "metadata": { + "total_parameters": 7615616512, + "total_size": 15231233024 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00003-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1a2db24 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 +size 11422778 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..d252dd4 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..a83392c --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 0.0046342451156378955, + "train_runtime": 100887.1622, + "train_samples": 7000, + "train_samples_per_second": 0.238, + "train_steps_per_second": 0.005 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..2f5c76c --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,11543 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5714285714285714, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_fraction": 0.0, + "completion_length": 2253.854206085205, + "epoch": 0.001142857142857143, + "grad_norm": 0.029786353930830956, + "kl": 0.0, + "lambda_div_used": 0.6170438528060913, + "learning_rate": 0.0, + "loss": -0.0476, + "reward": 0.09989889524877071, + "reward_after_mean": 0.09989889524877071, + "reward_after_std": 0.6247774921357632, + "reward_before_mean": 0.5353203006088734, + "reward_before_std": 0.5411310354247689, + "reward_change_max": 0.0, + "reward_change_mean": -0.4354214407503605, + "reward_change_min": -0.6615581586956978, + "reward_change_std": 0.2600514395162463, + "reward_std": 0.6247775163501501, + "rewards/accuracy_reward": 0.37500000931322575, + "rewards/cosine_scaled_reward": 0.16032031644135714, + "step": 1 + }, + { + "clip_fraction": 0.0, + "completion_length": 2566.395854949951, + "epoch": 0.002285714285714286, + "grad_norm": 0.025455903261899948, + "kl": 0.0, + "lambda_div_used": 0.6156510338187218, + "learning_rate": 5e-08, + "loss": 0.0349, + "reward": 0.10292071849107742, + "reward_after_mean": 0.10292071849107742, + "reward_after_std": 0.598213616758585, + "reward_before_mean": 0.5439198296517134, + "reward_before_std": 0.5335724893957376, + "reward_change_max": 0.0, + "reward_change_mean": -0.440999086946249, + "reward_change_min": -0.6536596268415451, + "reward_change_std": 0.2629614323377609, + "reward_std": 0.5982136316597462, + "rewards/accuracy_reward": 0.41666667722165585, + "rewards/cosine_scaled_reward": 0.12725313939154148, + "step": 2 + }, + { + "clip_fraction": 0.0, + "completion_length": 2808.2083740234375, + "epoch": 0.0034285714285714284, + "grad_norm": 0.026795223355293274, + "kl": 0.00017423927783966064, + "lambda_div_used": 0.599299855530262, + "learning_rate": 1e-07, + "loss": -0.011, + "reward": -0.23149854317307472, + "reward_after_mean": -0.23149854317307472, + "reward_after_std": 0.4924859032034874, + "reward_before_mean": 0.05049763061106205, + "reward_before_std": 0.4575129607692361, + "reward_change_max": 0.0, + "reward_change_mean": -0.28199618123471737, + "reward_change_min": -0.46816934645175934, + "reward_change_std": 0.17736693751066923, + "reward_std": 0.49248590879142284, + "rewards/accuracy_reward": 0.14583333767950535, + "rewards/cosine_scaled_reward": -0.09533570008352399, + "step": 3 + }, + { + "clip_fraction": 0.0, + "completion_length": 1537.4583435058594, + "epoch": 0.004571428571428572, + "grad_norm": 0.027395043522119522, + "kl": 8.840858936309814e-05, + "lambda_div_used": 0.6135994121432304, + "learning_rate": 1.5e-07, + "loss": 0.0364, + "reward": 0.19573484233114868, + "reward_after_mean": 0.19573484233114868, + "reward_after_std": 0.5626182612031698, + "reward_before_mean": 0.6533232685178518, + "reward_before_std": 0.5161786610260606, + "reward_change_max": 0.0, + "reward_change_mean": -0.4575884137302637, + "reward_change_min": -0.662933848798275, + "reward_change_std": 0.26597225945442915, + "reward_std": 0.5626182779669762, + "rewards/accuracy_reward": 0.45833334885537624, + "rewards/cosine_scaled_reward": 0.19498991407454014, + "step": 4 + }, + { + "clip_fraction": 0.0, + "completion_length": 2930.541748046875, + "epoch": 0.005714285714285714, + "grad_norm": 0.021351713687181473, + "kl": 0.00014585256576538086, + "lambda_div_used": 0.618914432823658, + "learning_rate": 2e-07, + "loss": 0.0298, + "reward": -0.07733920076861978, + "reward_after_mean": -0.07733920076861978, + "reward_after_std": 0.6453567277640104, + "reward_before_mean": 0.276840849313885, + "reward_before_std": 0.5498588550835848, + "reward_change_max": 0.0, + "reward_change_mean": -0.3541800267994404, + "reward_change_min": -0.5702032893896103, + "reward_change_std": 0.21250940579921007, + "reward_std": 0.6453567445278168, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/cosine_scaled_reward": 0.0060075074434280396, + "step": 5 + }, + { + "clip_fraction": 0.0, + "completion_length": 2457.187515258789, + "epoch": 0.006857142857142857, + "grad_norm": 0.03508686274290085, + "kl": 0.00010536611080169678, + "lambda_div_used": 0.6270971074700356, + "learning_rate": 2.5e-07, + "loss": -0.0326, + "reward": -0.08803003467619419, + "reward_after_mean": -0.08803003467619419, + "reward_after_std": 0.6014832425862551, + "reward_before_mean": 0.21033997228369117, + "reward_before_std": 0.5929712019860744, + "reward_change_max": 0.0, + "reward_change_mean": -0.29837000742554665, + "reward_change_min": -0.5160593837499619, + "reward_change_std": 0.2064626282081008, + "reward_std": 0.601483253762126, + "rewards/accuracy_reward": 0.22916666977107525, + "rewards/cosine_scaled_reward": -0.01882670260965824, + "step": 6 + }, + { + "clip_fraction": 0.0, + "completion_length": 2581.604263305664, + "epoch": 0.008, + "grad_norm": 0.02246660739183426, + "kl": 0.00012472271919250488, + "lambda_div_used": 0.6219364404678345, + "learning_rate": 3e-07, + "loss": -0.0131, + "reward": -0.06724199093878269, + "reward_after_mean": -0.06724199093878269, + "reward_after_std": 0.5799425262957811, + "reward_before_mean": 0.2561297030188143, + "reward_before_std": 0.5679045412689447, + "reward_change_max": 0.0, + "reward_change_mean": -0.3233717121183872, + "reward_change_min": -0.5483178608119488, + "reward_change_std": 0.213481605052948, + "reward_std": 0.5799425337463617, + "rewards/accuracy_reward": 0.27083334140479565, + "rewards/cosine_scaled_reward": -0.014703631401062012, + "step": 7 + }, + { + "clip_fraction": 0.0, + "completion_length": 1865.0625228881836, + "epoch": 0.009142857142857144, + "grad_norm": 0.0264554712921381, + "kl": 7.349252700805664e-05, + "lambda_div_used": 0.6559017673134804, + "learning_rate": 3.5e-07, + "loss": 0.0012, + "reward": 0.30751039180904627, + "reward_after_mean": 0.30751039180904627, + "reward_after_std": 0.768000740557909, + "reward_before_mean": 0.7620646432042122, + "reward_before_std": 0.7257685504155234, + "reward_change_max": 0.0, + "reward_change_mean": -0.4545542187988758, + "reward_change_min": -0.6947779208421707, + "reward_change_std": 0.28947674110531807, + "reward_std": 0.7680007480084896, + "rewards/accuracy_reward": 0.5208333469927311, + "rewards/cosine_scaled_reward": 0.24123129644431174, + "step": 8 + }, + { + "clip_fraction": 0.0, + "completion_length": 2611.187515258789, + "epoch": 0.010285714285714285, + "grad_norm": 0.02845916524529457, + "kl": 0.00014799833297729492, + "lambda_div_used": 0.6612376719713211, + "learning_rate": 4e-07, + "loss": -0.0083, + "reward": 0.022774726152420044, + "reward_after_mean": 0.022774726152420044, + "reward_after_std": 0.7959331478923559, + "reward_before_mean": 0.3162382678128779, + "reward_before_std": 0.7569107804447412, + "reward_change_max": 0.0, + "reward_change_mean": -0.29346355609595776, + "reward_change_min": -0.4966486766934395, + "reward_change_std": 0.20045297034084797, + "reward_std": 0.7959331627935171, + "rewards/accuracy_reward": 0.25000000186264515, + "rewards/cosine_scaled_reward": 0.06623826455324888, + "step": 9 + }, + { + "clip_fraction": 0.0, + "completion_length": 2326.8541870117188, + "epoch": 0.011428571428571429, + "grad_norm": 0.02597963623702526, + "kl": 0.0001014266163110733, + "lambda_div_used": 0.6026739403605461, + "learning_rate": 4.5e-07, + "loss": 0.012, + "reward": 0.08584612235426903, + "reward_after_mean": 0.08584612235426903, + "reward_after_std": 0.5247625019401312, + "reward_before_mean": 0.5338415652513504, + "reward_before_std": 0.4742008354514837, + "reward_change_max": 0.0, + "reward_change_mean": -0.4479954708367586, + "reward_change_min": -0.6653710156679153, + "reward_change_std": 0.2726733274757862, + "reward_std": 0.5247625187039375, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/cosine_scaled_reward": 0.13800824619829655, + "step": 10 + }, + { + "clip_fraction": 0.0, + "completion_length": 3257.875045776367, + "epoch": 0.012571428571428572, + "grad_norm": 0.01903906650841236, + "kl": 0.00015848875045776367, + "lambda_div_used": 0.6304982751607895, + "learning_rate": 5e-07, + "loss": 0.037, + "reward": -0.19499589689075947, + "reward_after_mean": -0.19499589689075947, + "reward_after_std": 0.6491430383175611, + "reward_before_mean": 0.052283127792179585, + "reward_before_std": 0.5964901968836784, + "reward_change_max": 0.0, + "reward_change_mean": -0.24727902933955193, + "reward_change_min": -0.40677722357213497, + "reward_change_std": 0.14844622276723385, + "reward_std": 0.649143049493432, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/cosine_scaled_reward": -0.11438353173434734, + "step": 11 + }, + { + "clip_fraction": 0.0, + "completion_length": 1922.6458587646484, + "epoch": 0.013714285714285714, + "grad_norm": 0.027084212750196457, + "kl": 0.00012683868408203125, + "lambda_div_used": 0.6053123474121094, + "learning_rate": 5.5e-07, + "loss": 0.0637, + "reward": -0.10656415252014995, + "reward_after_mean": -0.10656415252014995, + "reward_after_std": 0.5768749956041574, + "reward_before_mean": 0.25147354789078236, + "reward_before_std": 0.4855317808687687, + "reward_change_max": 0.0, + "reward_change_mean": -0.35803768038749695, + "reward_change_min": -0.5767498537898064, + "reward_change_std": 0.21485394705086946, + "reward_std": 0.5768750291317701, + "rewards/accuracy_reward": 0.25000000186264515, + "rewards/cosine_scaled_reward": 0.0014735234435647726, + "step": 12 + }, + { + "clip_fraction": 0.0, + "completion_length": 2585.979217529297, + "epoch": 0.014857142857142857, + "grad_norm": 0.02171475626528263, + "kl": 0.00012777745723724365, + "lambda_div_used": 0.5844379514455795, + "learning_rate": 6e-07, + "loss": 0.0567, + "reward": -0.11025732010602951, + "reward_after_mean": -0.11025732010602951, + "reward_after_std": 0.4723772555589676, + "reward_before_mean": 0.2753620855510235, + "reward_before_std": 0.39071971736848354, + "reward_change_max": 0.0, + "reward_change_mean": -0.3856194168329239, + "reward_change_min": -0.5897834450006485, + "reward_change_std": 0.2299499223008752, + "reward_std": 0.4723772667348385, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/cosine_scaled_reward": 0.004528738558292389, + "step": 13 + }, + { + "clip_fraction": 0.0, + "completion_length": 2412.1250534057617, + "epoch": 0.016, + "grad_norm": 0.024103164672851562, + "kl": 0.00015282630920410156, + "lambda_div_used": 0.6067279502749443, + "learning_rate": 6.5e-07, + "loss": -0.0015, + "reward": -0.08361193258315325, + "reward_after_mean": -0.08361193258315325, + "reward_after_std": 0.5700989812612534, + "reward_before_mean": 0.27934680134058, + "reward_before_std": 0.49509103409945965, + "reward_change_max": 0.0, + "reward_change_mean": -0.36295875161886215, + "reward_change_min": -0.6321466080844402, + "reward_change_std": 0.2297183210030198, + "reward_std": 0.570098988711834, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/cosine_scaled_reward": 0.0085134650580585, + "step": 14 + }, + { + "clip_fraction": 0.0, + "completion_length": 2612.5833778381348, + "epoch": 0.017142857142857144, + "grad_norm": 0.02622115984559059, + "kl": 0.00012434273958206177, + "lambda_div_used": 0.5387627929449081, + "learning_rate": 7e-07, + "loss": -0.0354, + "reward": -0.03696875274181366, + "reward_after_mean": -0.03696875274181366, + "reward_after_std": 0.3815008979290724, + "reward_before_mean": 0.5313108433037996, + "reward_before_std": 0.17025252804160118, + "reward_change_max": 0.0, + "reward_change_mean": -0.5682796090841293, + "reward_change_min": -0.7619944997131824, + "reward_change_std": 0.2890887148678303, + "reward_std": 0.3815009109675884, + "rewards/accuracy_reward": 0.375, + "rewards/cosine_scaled_reward": 0.15631086938083172, + "step": 15 + }, + { + "clip_fraction": 0.0, + "completion_length": 3487.4583435058594, + "epoch": 0.018285714285714287, + "grad_norm": 0.018314050510525703, + "kl": 0.00019049644470214844, + "lambda_div_used": 0.5788475871086121, + "learning_rate": 7.5e-07, + "loss": 0.0111, + "reward": -0.1648220755159855, + "reward_after_mean": -0.1648220755159855, + "reward_after_std": 0.3886314034461975, + "reward_before_mean": 0.18453767150640488, + "reward_before_std": 0.3589506670832634, + "reward_change_max": 0.0, + "reward_change_mean": -0.34935975447297096, + "reward_change_min": -0.5136087462306023, + "reward_change_std": 0.2073321659117937, + "reward_std": 0.38863140903413296, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/cosine_scaled_reward": -0.023795653134584427, + "step": 16 + }, + { + "clip_fraction": 0.0, + "completion_length": 1923.895851135254, + "epoch": 0.019428571428571427, + "grad_norm": 0.03498728573322296, + "kl": 0.00010513514280319214, + "lambda_div_used": 0.6301147192716599, + "learning_rate": 8e-07, + "loss": 0.0516, + "reward": 0.24809654615819454, + "reward_after_mean": 0.24809654615819454, + "reward_after_std": 0.7150995936244726, + "reward_before_mean": 0.749116275459528, + "reward_before_std": 0.5986730419099331, + "reward_change_max": 0.0, + "reward_change_mean": -0.5010197218507528, + "reward_change_min": -0.7233205642551184, + "reward_change_std": 0.2854560799896717, + "reward_std": 0.7150996085256338, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/cosine_scaled_reward": 0.2282829141477123, + "step": 17 + }, + { + "clip_fraction": 0.0, + "completion_length": 2648.3334197998047, + "epoch": 0.02057142857142857, + "grad_norm": 0.019363267347216606, + "kl": 0.00012411177158355713, + "lambda_div_used": 0.5733960121870041, + "learning_rate": 8.499999999999999e-07, + "loss": 0.0156, + "reward": -4.522036761045456e-05, + "reward_after_mean": -4.522036761045456e-05, + "reward_after_std": 0.491512268781662, + "reward_before_mean": 0.4940829328261316, + "reward_before_std": 0.33302280586212873, + "reward_change_max": 0.0, + "reward_change_mean": -0.49412815272808075, + "reward_change_min": -0.6739438865333796, + "reward_change_std": 0.25992031022906303, + "reward_std": 0.4915122911334038, + "rewards/accuracy_reward": 0.37500000558793545, + "rewards/cosine_scaled_reward": 0.11908289603888988, + "step": 18 + }, + { + "clip_fraction": 0.0, + "completion_length": 2037.93754196167, + "epoch": 0.021714285714285714, + "grad_norm": 0.0254252627491951, + "kl": 0.00010475516319274902, + "lambda_div_used": 0.5972657427191734, + "learning_rate": 9e-07, + "loss": -0.0277, + "reward": 0.3277764454251155, + "reward_after_mean": 0.3277764454251155, + "reward_after_std": 0.7013481389731169, + "reward_before_mean": 0.9843392036855221, + "reward_before_std": 0.44702923856675625, + "reward_change_max": 0.0, + "reward_change_mean": -0.6565627679228783, + "reward_change_min": -0.9309929609298706, + "reward_change_std": 0.3530767587944865, + "reward_std": 0.7013481538742781, + "rewards/accuracy_reward": 0.6041666697710752, + "rewards/cosine_scaled_reward": 0.3801724927034229, + "step": 19 + }, + { + "clip_fraction": 0.0, + "completion_length": 1404.291706085205, + "epoch": 0.022857142857142857, + "grad_norm": 0.028516631573438644, + "kl": 5.914270877838135e-05, + "lambda_div_used": 0.6129282414913177, + "learning_rate": 9.499999999999999e-07, + "loss": 0.0391, + "reward": 0.26722301356494427, + "reward_after_mean": 0.26722301356494427, + "reward_after_std": 0.6774719897657633, + "reward_before_mean": 0.8377129100263119, + "reward_before_std": 0.5324758047936484, + "reward_change_max": 0.0, + "reward_change_mean": -0.5704899430274963, + "reward_change_min": -0.8648187033832073, + "reward_change_std": 0.3376352610066533, + "reward_std": 0.6774720121175051, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/cosine_scaled_reward": 0.29604623932391405, + "step": 20 + }, + { + "clip_fraction": 0.0, + "completion_length": 2424.6042137145996, + "epoch": 0.024, + "grad_norm": 0.032928258180618286, + "kl": 0.000138014554977417, + "lambda_div_used": 0.6448317095637321, + "learning_rate": 1e-06, + "loss": 0.0361, + "reward": 0.23116276413202286, + "reward_after_mean": 0.23116276413202286, + "reward_after_std": 0.7203101813793182, + "reward_before_mean": 0.6635394699405879, + "reward_before_std": 0.6762269856408238, + "reward_change_max": 0.0, + "reward_change_mean": -0.4323767013847828, + "reward_change_min": -0.7010968029499054, + "reward_change_std": 0.2771869823336601, + "reward_std": 0.7203102335333824, + "rewards/accuracy_reward": 0.43750000931322575, + "rewards/cosine_scaled_reward": 0.22603945806622505, + "step": 21 + }, + { + "clip_fraction": 0.0, + "completion_length": 1410.8750381469727, + "epoch": 0.025142857142857144, + "grad_norm": 0.03686099499464035, + "kl": 0.00010958313941955566, + "lambda_div_used": 0.5709755718708038, + "learning_rate": 9.99931462820376e-07, + "loss": -0.0637, + "reward": -0.13723512832075357, + "reward_after_mean": -0.13723512832075357, + "reward_after_std": 0.4940304774791002, + "reward_before_mean": 0.30008178018033504, + "reward_before_std": 0.32625696901232004, + "reward_change_max": 0.0, + "reward_change_mean": -0.4373169243335724, + "reward_change_min": -0.6162486486136913, + "reward_change_std": 0.23701479192823172, + "reward_std": 0.4940304830670357, + "rewards/accuracy_reward": 0.31250000186264515, + "rewards/cosine_scaled_reward": -0.012418218422681093, + "step": 22 + }, + { + "clip_fraction": 0.0, + "completion_length": 2354.062568664551, + "epoch": 0.026285714285714287, + "grad_norm": 0.025602027773857117, + "kl": 0.00011620670557022095, + "lambda_div_used": 0.6488568410277367, + "learning_rate": 9.997258721585931e-07, + "loss": 0.0419, + "reward": -0.012333650141954422, + "reward_after_mean": -0.012333650141954422, + "reward_after_std": 0.698169419541955, + "reward_before_mean": 0.27921401464845985, + "reward_before_std": 0.6954333996400237, + "reward_change_max": 0.0, + "reward_change_mean": -0.29154767468571663, + "reward_change_min": -0.5527428761124611, + "reward_change_std": 0.2108509410172701, + "reward_std": 0.6981694512069225, + "rewards/accuracy_reward": 0.27083333767950535, + "rewards/cosine_scaled_reward": 0.008380686864256859, + "step": 23 + }, + { + "clip_fraction": 0.0, + "completion_length": 1993.1041870117188, + "epoch": 0.027428571428571427, + "grad_norm": 0.03096413053572178, + "kl": 8.243322372436523e-05, + "lambda_div_used": 0.6519145146012306, + "learning_rate": 9.993832906395582e-07, + "loss": 0.0795, + "reward": 0.11854812642559409, + "reward_after_mean": 0.11854812642559409, + "reward_after_std": 0.7567200511693954, + "reward_before_mean": 0.49915426783263683, + "reward_before_std": 0.7109423782676458, + "reward_change_max": 0.0, + "reward_change_mean": -0.38060615211725235, + "reward_change_min": -0.7067882716655731, + "reward_change_std": 0.26373046822845936, + "reward_std": 0.7567200735211372, + "rewards/accuracy_reward": 0.3750000037252903, + "rewards/cosine_scaled_reward": 0.12415427155792713, + "step": 24 + }, + { + "clip_fraction": 0.0, + "completion_length": 2234.8333435058594, + "epoch": 0.02857142857142857, + "grad_norm": 0.022234002128243446, + "kl": 0.00014199316501617432, + "lambda_div_used": 0.6245157197117805, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0107, + "reward": 0.08800357580184937, + "reward_after_mean": 0.08800357580184937, + "reward_after_std": 0.5656109545379877, + "reward_before_mean": 0.46775088645517826, + "reward_before_std": 0.5775847099721432, + "reward_change_max": 0.0, + "reward_change_mean": -0.37974734231829643, + "reward_change_min": -0.6116136200726032, + "reward_change_std": 0.2511585932224989, + "reward_std": 0.5656109638512135, + "rewards/accuracy_reward": 0.354166679084301, + "rewards/cosine_scaled_reward": 0.1135842353105545, + "step": 25 + }, + { + "clip_fraction": 0.0, + "completion_length": 2473.708366394043, + "epoch": 0.029714285714285714, + "grad_norm": 0.02305966429412365, + "kl": 0.00014957785606384277, + "lambda_div_used": 0.57183438539505, + "learning_rate": 9.982876141412855e-07, + "loss": -0.0358, + "reward": -0.41022508684545755, + "reward_after_mean": -0.41022508684545755, + "reward_after_std": 0.3927479684352875, + "reward_before_mean": -0.15500983409583569, + "reward_before_std": 0.32340476755052805, + "reward_change_max": 0.0, + "reward_change_mean": -0.25521524623036385, + "reward_change_min": -0.4159896522760391, + "reward_change_std": 0.1455942215397954, + "reward_std": 0.3927479758858681, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/cosine_scaled_reward": -0.21750983409583569, + "step": 26 + }, + { + "clip_fraction": 0.0, + "completion_length": 2344.5625762939453, + "epoch": 0.030857142857142857, + "grad_norm": 0.03154224529862404, + "kl": 0.00014576315879821777, + "lambda_div_used": 0.5766249001026154, + "learning_rate": 9.975348529157229e-07, + "loss": 0.1289, + "reward": -0.04079665243625641, + "reward_after_mean": -0.04079665243625641, + "reward_after_std": 0.4307698383927345, + "reward_before_mean": 0.39524078369140625, + "reward_before_std": 0.3457355350255966, + "reward_change_max": 0.0, + "reward_change_mean": -0.4360374417155981, + "reward_change_min": -0.6437131129205227, + "reward_change_std": 0.24637807440012693, + "reward_std": 0.4307698402553797, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/cosine_scaled_reward": 0.061907440423965454, + "step": 27 + }, + { + "clip_fraction": 0.0, + "completion_length": 2371.4375381469727, + "epoch": 0.032, + "grad_norm": 0.030750228092074394, + "kl": 0.00011971592903137207, + "lambda_div_used": 0.6046403273940086, + "learning_rate": 9.96645768238595e-07, + "loss": 0.0725, + "reward": 0.06598741095513105, + "reward_after_mean": 0.06598741095513105, + "reward_after_std": 0.6246800310909748, + "reward_before_mean": 0.5469000339508057, + "reward_before_std": 0.47788948379456997, + "reward_change_max": 0.0, + "reward_change_mean": -0.4809126127511263, + "reward_change_min": -0.6964126750826836, + "reward_change_std": 0.2671422157436609, + "reward_std": 0.6246800404042006, + "rewards/accuracy_reward": 0.4375000074505806, + "rewards/cosine_scaled_reward": 0.10940003173891455, + "step": 28 + }, + { + "clip_fraction": 0.0, + "completion_length": 2796.6250762939453, + "epoch": 0.03314285714285714, + "grad_norm": 0.022529419511556625, + "kl": 0.00014966726303100586, + "lambda_div_used": 0.5826017782092094, + "learning_rate": 9.956206309337066e-07, + "loss": -0.0443, + "reward": -0.2186606228351593, + "reward_after_mean": -0.2186606228351593, + "reward_after_std": 0.4133179672062397, + "reward_before_mean": 0.09334492683410645, + "reward_before_std": 0.3791744504123926, + "reward_change_max": 0.0, + "reward_change_mean": -0.3120055440813303, + "reward_change_min": -0.4988309144973755, + "reward_change_std": 0.19146351423114538, + "reward_std": 0.41331798397004604, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/cosine_scaled_reward": -0.07332174107432365, + "step": 29 + }, + { + "clip_fraction": 0.0, + "completion_length": 2330.354232788086, + "epoch": 0.03428571428571429, + "grad_norm": 0.02580900862812996, + "kl": 0.00011408329010009766, + "lambda_div_used": 0.617066040635109, + "learning_rate": 9.944597532678119e-07, + "loss": 0.0106, + "reward": -0.01328302314504981, + "reward_after_mean": -0.01328302314504981, + "reward_after_std": 0.6155566833913326, + "reward_before_mean": 0.36568982464814326, + "reward_before_std": 0.536849819123745, + "reward_change_max": 0.0, + "reward_change_mean": -0.37897284515202045, + "reward_change_min": -0.5798661820590496, + "reward_change_std": 0.22223789989948273, + "reward_std": 0.6155567076057196, + "rewards/accuracy_reward": 0.3125000074505806, + "rewards/cosine_scaled_reward": 0.05318982107564807, + "step": 30 + }, + { + "clip_fraction": 0.0, + "completion_length": 2794.4375381469727, + "epoch": 0.03542857142857143, + "grad_norm": 0.021817587316036224, + "kl": 0.0001348257064819336, + "lambda_div_used": 0.6117052882909775, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0028, + "reward": -0.07263503596186638, + "reward_after_mean": -0.07263503596186638, + "reward_after_std": 0.5405435804277658, + "reward_before_mean": 0.26847894000820816, + "reward_before_std": 0.5099836494773626, + "reward_change_max": 0.0, + "reward_change_mean": -0.34111399203538895, + "reward_change_min": -0.5489708594977856, + "reward_change_std": 0.21398558467626572, + "reward_std": 0.5405435990542173, + "rewards/accuracy_reward": 0.27083334140479565, + "rewards/cosine_scaled_reward": -0.002354402095079422, + "step": 31 + }, + { + "clip_fraction": 0.0, + "completion_length": 2270.2292098999023, + "epoch": 0.036571428571428574, + "grad_norm": 0.024800026789307594, + "kl": 0.00010867416858673096, + "lambda_div_used": 0.6270024925470352, + "learning_rate": 9.917322325514487e-07, + "loss": 0.0215, + "reward": 0.13659005239605904, + "reward_after_mean": 0.13659005239605904, + "reward_after_std": 0.6418719291687012, + "reward_before_mean": 0.5631819479167461, + "reward_before_std": 0.5894247069954872, + "reward_change_max": 0.0, + "reward_change_mean": -0.4265918843448162, + "reward_change_min": -0.6561249867081642, + "reward_change_std": 0.26332158874720335, + "reward_std": 0.6418719589710236, + "rewards/accuracy_reward": 0.39583334140479565, + "rewards/cosine_scaled_reward": 0.1673485841602087, + "step": 32 + }, + { + "clip_fraction": 0.0, + "completion_length": 2852.000045776367, + "epoch": 0.037714285714285714, + "grad_norm": 0.025015637278556824, + "kl": 0.00012034177780151367, + "lambda_div_used": 0.6343094930052757, + "learning_rate": 9.901664203302124e-07, + "loss": -0.0682, + "reward": 0.12994618620723486, + "reward_after_mean": 0.12994618620723486, + "reward_after_std": 0.641582889482379, + "reward_before_mean": 0.5249918717890978, + "reward_before_std": 0.6226585754193366, + "reward_change_max": 0.0, + "reward_change_mean": -0.39504568465054035, + "reward_change_min": -0.6912417784333229, + "reward_change_std": 0.2625753004103899, + "reward_std": 0.6415829043835402, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/cosine_scaled_reward": 0.12915852759033442, + "step": 33 + }, + { + "clip_fraction": 0.0, + "completion_length": 1936.0000305175781, + "epoch": 0.038857142857142854, + "grad_norm": 0.027393249794840813, + "kl": 0.0001316368579864502, + "lambda_div_used": 0.6432492136955261, + "learning_rate": 9.88466529153356e-07, + "loss": 0.0524, + "reward": 0.22461825609207153, + "reward_after_mean": 0.22461825609207153, + "reward_after_std": 0.6486394293606281, + "reward_before_mean": 0.6393125429749489, + "reward_before_std": 0.6651058997958899, + "reward_change_max": 0.0, + "reward_change_mean": -0.41469427943229675, + "reward_change_min": -0.6961428225040436, + "reward_change_std": 0.27915553003549576, + "reward_std": 0.6486394479870796, + "rewards/accuracy_reward": 0.416666679084301, + "rewards/cosine_scaled_reward": 0.222645852714777, + "step": 34 + }, + { + "clip_fraction": 0.0, + "completion_length": 2444.270851135254, + "epoch": 0.04, + "grad_norm": 0.036447569727897644, + "kl": 0.0001233518123626709, + "lambda_div_used": 0.641115739941597, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0574, + "reward": 0.17524952441453934, + "reward_after_mean": 0.17524952441453934, + "reward_after_std": 0.6338076200336218, + "reward_before_mean": 0.5732492320239544, + "reward_before_std": 0.6535743195563555, + "reward_change_max": 0.0, + "reward_change_mean": -0.39799970760941505, + "reward_change_min": -0.6731353290379047, + "reward_change_std": 0.2706009875983, + "reward_std": 0.6338076237589121, + "rewards/accuracy_reward": 0.416666679084301, + "rewards/cosine_scaled_reward": 0.15658256597816944, + "step": 35 + }, + { + "clip_fraction": 0.0, + "completion_length": 3021.5834045410156, + "epoch": 0.04114285714285714, + "grad_norm": 0.023824643343687057, + "kl": 0.00018006563186645508, + "lambda_div_used": 0.6088642552495003, + "learning_rate": 9.846666218300807e-07, + "loss": -0.0045, + "reward": -0.21179450303316116, + "reward_after_mean": -0.21179450303316116, + "reward_after_std": 0.521540641784668, + "reward_before_mean": 0.07177379354834557, + "reward_before_std": 0.5020219217985868, + "reward_change_max": 0.0, + "reward_change_mean": -0.2835683096200228, + "reward_change_min": -0.5416868068277836, + "reward_change_std": 0.1942979209125042, + "reward_std": 0.5215406529605389, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/cosine_scaled_reward": -0.07405953668057919, + "step": 36 + }, + { + "clip_fraction": 0.0, + "completion_length": 2800.3541717529297, + "epoch": 0.04228571428571429, + "grad_norm": 0.022017668932676315, + "kl": 0.0001254826784133911, + "lambda_div_used": 0.5779423043131828, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0116, + "reward": -0.2564888745546341, + "reward_after_mean": -0.2564888745546341, + "reward_after_std": 0.4095242340117693, + "reward_before_mean": 0.04728756472468376, + "reward_before_std": 0.3590739220380783, + "reward_change_max": 0.0, + "reward_change_mean": -0.3037764262408018, + "reward_change_min": -0.48755551874637604, + "reward_change_std": 0.18500223569571972, + "reward_std": 0.40952424332499504, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/cosine_scaled_reward": -0.09854578226804733, + "step": 37 + }, + { + "clip_fraction": 0.0, + "completion_length": 3207.8541870117188, + "epoch": 0.04342857142857143, + "grad_norm": 0.01808248646557331, + "kl": 0.00016523152589797974, + "lambda_div_used": 0.5784263759851456, + "learning_rate": 9.80337140183366e-07, + "loss": 0.0282, + "reward": -0.26364604104310274, + "reward_after_mean": -0.26364604104310274, + "reward_after_std": 0.40342688001692295, + "reward_before_mean": 0.05271115526556969, + "reward_before_std": 0.35847953893244267, + "reward_change_max": 0.0, + "reward_change_mean": -0.31635717302560806, + "reward_change_min": -0.5081874057650566, + "reward_change_std": 0.19082189723849297, + "reward_std": 0.4034268856048584, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/cosine_scaled_reward": -0.0931221836945042, + "step": 38 + }, + { + "clip_fraction": 0.0, + "completion_length": 2402.250045776367, + "epoch": 0.044571428571428574, + "grad_norm": 0.02232450246810913, + "kl": 0.00010313093662261963, + "lambda_div_used": 0.594361886382103, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0018, + "reward": -0.1193324881605804, + "reward_after_mean": -0.1193324881605804, + "reward_after_std": 0.5286196451634169, + "reward_before_mean": 0.25862734392285347, + "reward_before_std": 0.4300632723607123, + "reward_change_max": 0.0, + "reward_change_mean": -0.37795985862612724, + "reward_change_min": -0.5456654913723469, + "reward_change_std": 0.21102675329893827, + "reward_std": 0.5286196675151587, + "rewards/accuracy_reward": 0.2708333395421505, + "rewards/cosine_scaled_reward": -0.01220599515363574, + "step": 39 + }, + { + "clip_fraction": 0.0, + "completion_length": 2159.604217529297, + "epoch": 0.045714285714285714, + "grad_norm": 0.02471066638827324, + "kl": 0.00011989474296569824, + "lambda_div_used": 0.5991866067051888, + "learning_rate": 9.754833590196926e-07, + "loss": 0.0737, + "reward": 0.01806516945362091, + "reward_after_mean": 0.01806516945362091, + "reward_after_std": 0.5395061280578375, + "reward_before_mean": 0.45469519402831793, + "reward_before_std": 0.4553617415949702, + "reward_change_max": 0.0, + "reward_change_mean": -0.43663003854453564, + "reward_change_min": -0.6850062496960163, + "reward_change_std": 0.2616432458162308, + "reward_std": 0.5395061578601599, + "rewards/accuracy_reward": 0.3333333395421505, + "rewards/cosine_scaled_reward": 0.12136187124997377, + "step": 40 + }, + { + "clip_fraction": 0.0, + "completion_length": 2768.5000610351562, + "epoch": 0.046857142857142854, + "grad_norm": 0.021616969257593155, + "kl": 0.00012002140283584595, + "lambda_div_used": 0.6239832416176796, + "learning_rate": 9.728616793536587e-07, + "loss": -0.0156, + "reward": 0.14593233913183212, + "reward_after_mean": 0.14593233913183212, + "reward_after_std": 0.63340456597507, + "reward_before_mean": 0.5886982697993517, + "reward_before_std": 0.5802208222448826, + "reward_change_max": 0.0, + "reward_change_mean": -0.442765936255455, + "reward_change_min": -0.7020618245005608, + "reward_change_std": 0.2809063671156764, + "reward_std": 0.6334045827388763, + "rewards/accuracy_reward": 0.4375000111758709, + "rewards/cosine_scaled_reward": 0.1511982548981905, + "step": 41 + }, + { + "clip_fraction": 0.0, + "completion_length": 2634.8333702087402, + "epoch": 0.048, + "grad_norm": 0.04166898876428604, + "kl": 0.00016657263040542603, + "lambda_div_used": 0.5780549123883247, + "learning_rate": 9.701111919237408e-07, + "loss": 0.0133, + "reward": -0.34821823611855507, + "reward_after_mean": -0.34821823611855507, + "reward_after_std": 0.42264553159475327, + "reward_before_mean": -0.07792945206165314, + "reward_before_std": 0.3588131470605731, + "reward_change_max": 0.0, + "reward_change_mean": -0.2702887710183859, + "reward_change_min": -0.42083460837602615, + "reward_change_std": 0.15945285465568304, + "reward_std": 0.42264554649591446, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/cosine_scaled_reward": -0.16126279765740037, + "step": 42 + }, + { + "clip_fraction": 0.0, + "completion_length": 2635.833396911621, + "epoch": 0.04914285714285714, + "grad_norm": 0.02200184389948845, + "kl": 0.0001204218715429306, + "lambda_div_used": 0.6316910237073898, + "learning_rate": 9.672327345550543e-07, + "loss": -0.0063, + "reward": 0.021529126912355423, + "reward_after_mean": 0.021529126912355423, + "reward_after_std": 0.7083056271076202, + "reward_before_mean": 0.3985202740877867, + "reward_before_std": 0.6068296208977699, + "reward_change_max": 0.0, + "reward_change_mean": -0.3769911602139473, + "reward_change_min": -0.5825657024979591, + "reward_change_std": 0.2227043965831399, + "reward_std": 0.7083056569099426, + "rewards/accuracy_reward": 0.31250000558793545, + "rewards/cosine_scaled_reward": 0.08602028086897917, + "step": 43 + }, + { + "clip_fraction": 0.0, + "completion_length": 2195.229248046875, + "epoch": 0.05028571428571429, + "grad_norm": 0.036139003932476044, + "kl": 0.00012126564979553223, + "lambda_div_used": 0.5746868774294853, + "learning_rate": 9.64227184053598e-07, + "loss": 0.1008, + "reward": -0.02278389036655426, + "reward_after_mean": -0.02278389036655426, + "reward_after_std": 0.43771820329129696, + "reward_before_mean": 0.4344022050499916, + "reward_before_std": 0.33903054893016815, + "reward_change_max": 0.0, + "reward_change_mean": -0.457186084240675, + "reward_change_min": -0.6404859870672226, + "reward_change_std": 0.257523151114583, + "reward_std": 0.43771822564303875, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/cosine_scaled_reward": 0.10106886364519596, + "step": 44 + }, + { + "clip_fraction": 0.0, + "completion_length": 3065.2083740234375, + "epoch": 0.05142857142857143, + "grad_norm": 0.01725860871374607, + "kl": 0.0001525580883026123, + "lambda_div_used": 0.6148836985230446, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0504, + "reward": 0.07026529498398304, + "reward_after_mean": 0.07026529498398304, + "reward_after_std": 0.5995844416320324, + "reward_before_mean": 0.49401637725532055, + "reward_before_std": 0.5296608861535788, + "reward_change_max": 0.0, + "reward_change_mean": -0.42375105805695057, + "reward_change_min": -0.6395902335643768, + "reward_change_std": 0.25529387686401606, + "reward_std": 0.599584462121129, + "rewards/accuracy_reward": 0.37500000931322575, + "rewards/cosine_scaled_reward": 0.11901635373942554, + "step": 45 + }, + { + "clip_fraction": 0.0, + "completion_length": 2750.166702270508, + "epoch": 0.052571428571428575, + "grad_norm": 0.024388441815972328, + "kl": 0.0001780986785888672, + "lambda_div_used": 0.5804363936185837, + "learning_rate": 9.578385041664925e-07, + "loss": 0.045, + "reward": -0.34557132300687954, + "reward_after_mean": -0.34557132300687954, + "reward_after_std": 0.41662513464689255, + "reward_before_mean": -0.07300508208572865, + "reward_before_std": 0.3654432473704219, + "reward_change_max": 0.0, + "reward_change_mean": -0.2725662402808666, + "reward_change_min": -0.43397457897663116, + "reward_change_std": 0.16234493535012007, + "reward_std": 0.4166251439601183, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/cosine_scaled_reward": -0.15633842581883073, + "step": 46 + }, + { + "clip_fraction": 0.0, + "completion_length": 2203.437568664551, + "epoch": 0.053714285714285714, + "grad_norm": 0.02950853668153286, + "kl": 0.00010627508163452148, + "lambda_div_used": 0.6182287782430649, + "learning_rate": 9.54457320834625e-07, + "loss": -0.0342, + "reward": 0.08941356465220451, + "reward_after_mean": 0.08941356465220451, + "reward_after_std": 0.5494941845536232, + "reward_before_mean": 0.48168421536684036, + "reward_before_std": 0.5449010655283928, + "reward_change_max": 0.0, + "reward_change_mean": -0.39227062091231346, + "reward_change_min": -0.6192042604088783, + "reward_change_std": 0.2517077624797821, + "reward_std": 0.5494942031800747, + "rewards/accuracy_reward": 0.354166679084301, + "rewards/cosine_scaled_reward": 0.12751753628253937, + "step": 47 + }, + { + "clip_fraction": 0.0, + "completion_length": 2591.729232788086, + "epoch": 0.054857142857142854, + "grad_norm": 0.026953846216201782, + "kl": 0.00012552738189697266, + "lambda_div_used": 0.6436027362942696, + "learning_rate": 9.509529358847654e-07, + "loss": 0.0003, + "reward": 0.07579808123409748, + "reward_after_mean": 0.07579808123409748, + "reward_after_std": 0.7408107332885265, + "reward_before_mean": 0.4633368235081434, + "reward_before_std": 0.6680604638531804, + "reward_change_max": 0.0, + "reward_change_mean": -0.38753873854875565, + "reward_change_min": -0.686689231544733, + "reward_change_std": 0.25357643235474825, + "reward_std": 0.740810751914978, + "rewards/accuracy_reward": 0.33333333767950535, + "rewards/cosine_scaled_reward": 0.130003463011235, + "step": 48 + }, + { + "clip_fraction": 0.0, + "completion_length": 1606.1875267028809, + "epoch": 0.056, + "grad_norm": 0.03145647421479225, + "kl": 8.93324613571167e-05, + "lambda_div_used": 0.6448555663228035, + "learning_rate": 9.473264167865171e-07, + "loss": -0.0559, + "reward": 0.11411497555673122, + "reward_after_mean": 0.11411497555673122, + "reward_after_std": 0.6693379506468773, + "reward_before_mean": 0.456937775015831, + "reward_before_std": 0.679039599490352, + "reward_change_max": 0.0, + "reward_change_mean": -0.3428228013217449, + "reward_change_min": -0.5654460191726685, + "reward_change_std": 0.23543909844011068, + "reward_std": 0.6693379702046514, + "rewards/accuracy_reward": 0.35416667722165585, + "rewards/cosine_scaled_reward": 0.10277110431343317, + "step": 49 + }, + { + "clip_fraction": 0.0, + "completion_length": 2721.18754196167, + "epoch": 0.05714285714285714, + "grad_norm": 0.020998205989599228, + "kl": 0.00010278820991516113, + "lambda_div_used": 0.5569720417261124, + "learning_rate": 9.43578868212728e-07, + "loss": 0.0198, + "reward": -0.014975886791944504, + "reward_after_mean": -0.014975886791944504, + "reward_after_std": 0.47250125743448734, + "reward_before_mean": 0.5332869850099087, + "reward_before_std": 0.25590797886252403, + "reward_change_max": 0.0, + "reward_change_mean": -0.5482628662139177, + "reward_change_min": -0.7288287468254566, + "reward_change_std": 0.2796243606135249, + "reward_std": 0.4725012853741646, + "rewards/accuracy_reward": 0.39583333395421505, + "rewards/cosine_scaled_reward": 0.13745362346526235, + "step": 50 + }, + { + "clip_fraction": 0.0, + "completion_length": 2238.145851135254, + "epoch": 0.05828571428571429, + "grad_norm": 0.030202677473425865, + "kl": 0.00016352534294128418, + "lambda_div_used": 0.589074470102787, + "learning_rate": 9.397114317029974e-07, + "loss": -0.0263, + "reward": -0.32303538359701633, + "reward_after_mean": -0.32303538359701633, + "reward_after_std": 0.4511607848107815, + "reward_before_mean": -0.059135761111974716, + "reward_before_std": 0.41417009476572275, + "reward_change_max": 0.0, + "reward_change_mean": -0.26389962807297707, + "reward_change_min": -0.46818122640252113, + "reward_change_std": 0.1724827392026782, + "reward_std": 0.4511608015745878, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/cosine_scaled_reward": -0.16330242389813066, + "step": 51 + }, + { + "clip_fraction": 0.0, + "completion_length": 2562.2292098999023, + "epoch": 0.05942857142857143, + "grad_norm": 0.027546504512429237, + "kl": 0.00012978166341781616, + "lambda_div_used": 0.5993086248636246, + "learning_rate": 9.357252853159505e-07, + "loss": 0.0385, + "reward": 0.08331240899860859, + "reward_after_mean": 0.08331240899860859, + "reward_after_std": 0.5890753846615553, + "reward_before_mean": 0.5538598063867539, + "reward_before_std": 0.45862336084246635, + "reward_change_max": 0.0, + "reward_change_mean": -0.4705474264919758, + "reward_change_min": -0.6809027269482613, + "reward_change_std": 0.26692016143351793, + "reward_std": 0.5890753846615553, + "rewards/accuracy_reward": 0.43750000558793545, + "rewards/cosine_scaled_reward": 0.11635981127619743, + "step": 52 + }, + { + "clip_fraction": 0.0, + "completion_length": 2494.8750762939453, + "epoch": 0.060571428571428575, + "grad_norm": 0.024038787931203842, + "kl": 0.0001424252986907959, + "lambda_div_used": 0.6427036076784134, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0239, + "reward": 0.016640717163681984, + "reward_after_mean": 0.016640717163681984, + "reward_after_std": 0.6831068731844425, + "reward_before_mean": 0.334790101274848, + "reward_before_std": 0.665732966735959, + "reward_change_max": 0.0, + "reward_change_mean": -0.318149384111166, + "reward_change_min": -0.5409456379711628, + "reward_change_std": 0.2164039220660925, + "reward_std": 0.6831068824976683, + "rewards/accuracy_reward": 0.29166667349636555, + "rewards/cosine_scaled_reward": 0.043123436626046896, + "step": 53 + }, + { + "clip_fraction": 0.0, + "completion_length": 1934.7083892822266, + "epoch": 0.061714285714285715, + "grad_norm": 0.0318799689412117, + "kl": 9.445101022720337e-05, + "lambda_div_used": 0.6365627199411392, + "learning_rate": 9.274017555754407e-07, + "loss": 0.0748, + "reward": 0.47428043745458126, + "reward_after_mean": 0.47428043745458126, + "reward_after_std": 0.7259879875928164, + "reward_before_mean": 1.077655490487814, + "reward_before_std": 0.6342989937402308, + "reward_change_max": 0.0, + "reward_change_mean": -0.6033750809729099, + "reward_change_min": -0.9116491675376892, + "reward_change_std": 0.3683675564825535, + "reward_std": 0.7259880118072033, + "rewards/accuracy_reward": 0.6458333469927311, + "rewards/cosine_scaled_reward": 0.43182216165587306, + "step": 54 + }, + { + "clip_fraction": 0.0, + "completion_length": 2623.7708892822266, + "epoch": 0.06285714285714286, + "grad_norm": 0.02094973810017109, + "kl": 0.00013570114970207214, + "lambda_div_used": 0.645888201892376, + "learning_rate": 9.230669076497687e-07, + "loss": 0.0808, + "reward": 0.17671905946917832, + "reward_after_mean": 0.17671905946917832, + "reward_after_std": 0.7367929276078939, + "reward_before_mean": 0.5884007401764393, + "reward_before_std": 0.6809105025604367, + "reward_change_max": 0.0, + "reward_change_mean": -0.4116816818714142, + "reward_change_min": -0.6507021151483059, + "reward_change_std": 0.258549933321774, + "reward_std": 0.7367929276078939, + "rewards/accuracy_reward": 0.3958333395421505, + "rewards/cosine_scaled_reward": 0.19256740622222424, + "step": 55 + }, + { + "clip_fraction": 0.0, + "completion_length": 2702.8750915527344, + "epoch": 0.064, + "grad_norm": 0.02438957802951336, + "kl": 0.00014606118202209473, + "lambda_div_used": 0.583276279270649, + "learning_rate": 9.186184199300463e-07, + "loss": 0.0126, + "reward": -0.2818741099908948, + "reward_after_mean": -0.2818741099908948, + "reward_after_std": 0.4212169963866472, + "reward_before_mean": 0.015760678332298994, + "reward_before_std": 0.38252383656799793, + "reward_change_max": 0.0, + "reward_change_mean": -0.2976347878575325, + "reward_change_min": -0.5124507918953896, + "reward_change_std": 0.1885841079056263, + "reward_std": 0.4212170038372278, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/cosine_scaled_reward": -0.1092393291182816, + "step": 56 + }, + { + "clip_fraction": 0.0, + "completion_length": 3056.4583740234375, + "epoch": 0.06514285714285714, + "grad_norm": 0.01594528928399086, + "kl": 0.00010664761066436768, + "lambda_div_used": 0.6316090971231461, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0184, + "reward": -0.039651480969041586, + "reward_after_mean": -0.039651480969041586, + "reward_after_std": 0.6351467221975327, + "reward_before_mean": 0.2791806310415268, + "reward_before_std": 0.605388393625617, + "reward_change_max": 0.0, + "reward_change_mean": -0.3188321180641651, + "reward_change_min": -0.5200116373598576, + "reward_change_std": 0.20627015084028244, + "reward_std": 0.6351467464119196, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/cosine_scaled_reward": 0.008347294380655512, + "step": 57 + }, + { + "clip_fraction": 0.0, + "completion_length": 1619.8541946411133, + "epoch": 0.06628571428571428, + "grad_norm": 0.028398146852850914, + "kl": 7.768720388412476e-05, + "lambda_div_used": 0.6244253218173981, + "learning_rate": 9.093859795212817e-07, + "loss": -0.0744, + "reward": 0.02804100140929222, + "reward_after_mean": 0.02804100140929222, + "reward_after_std": 0.6312676724046469, + "reward_before_mean": 0.41605534171685576, + "reward_before_std": 0.5784196928143501, + "reward_change_max": 0.0, + "reward_change_mean": -0.38801432587206364, + "reward_change_min": -0.6227842308580875, + "reward_change_std": 0.24862205237150192, + "reward_std": 0.6312677096575499, + "rewards/accuracy_reward": 0.35416666977107525, + "rewards/cosine_scaled_reward": 0.061888658441603184, + "step": 58 + }, + { + "clip_fraction": 0.0, + "completion_length": 2521.937530517578, + "epoch": 0.06742857142857143, + "grad_norm": 0.023369140923023224, + "kl": 9.158626198768616e-05, + "lambda_div_used": 0.6007750853896141, + "learning_rate": 9.046048391230247e-07, + "loss": 0.0054, + "reward": -0.07107383571565151, + "reward_after_mean": -0.07107383571565151, + "reward_after_std": 0.49415648356080055, + "reward_before_mean": 0.2814117716625333, + "reward_before_std": 0.466181633528322, + "reward_change_max": 0.0, + "reward_change_mean": -0.3524855989962816, + "reward_change_min": -0.5633360184729099, + "reward_change_std": 0.22167869098484516, + "reward_std": 0.49415648356080055, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/cosine_scaled_reward": 0.03141175117343664, + "step": 59 + }, + { + "clip_fraction": 0.0, + "completion_length": 2424.3125762939453, + "epoch": 0.06857142857142857, + "grad_norm": 0.020705759525299072, + "kl": 0.00011293590068817139, + "lambda_div_used": 0.6067081466317177, + "learning_rate": 8.997156826556369e-07, + "loss": 0.0407, + "reward": 0.07130313850939274, + "reward_after_mean": 0.07130313850939274, + "reward_after_std": 0.5420917756855488, + "reward_before_mean": 0.5138208344578743, + "reward_before_std": 0.4931760486215353, + "reward_change_max": 0.0, + "reward_change_mean": -0.4425176791846752, + "reward_change_min": -0.6790216974914074, + "reward_change_std": 0.2717863190919161, + "reward_std": 0.5420917868614197, + "rewards/accuracy_reward": 0.3750000074505806, + "rewards/cosine_scaled_reward": 0.1388207976706326, + "step": 60 + }, + { + "clip_fraction": 0.0, + "completion_length": 2410.625030517578, + "epoch": 0.06971428571428571, + "grad_norm": 0.0203632153570652, + "kl": 9.970366954803467e-05, + "lambda_div_used": 0.5674436464905739, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0006, + "reward": -0.1596299186348915, + "reward_after_mean": -0.1596299186348915, + "reward_after_std": 0.40844789519906044, + "reward_before_mean": 0.2439738381654024, + "reward_before_std": 0.3030575467273593, + "reward_change_max": 0.0, + "reward_change_mean": -0.40360378473997116, + "reward_change_min": -0.562778364866972, + "reward_change_std": 0.21779226139187813, + "reward_std": 0.4084479194134474, + "rewards/accuracy_reward": 0.25000000558793545, + "rewards/cosine_scaled_reward": -0.006026154384016991, + "step": 61 + }, + { + "clip_fraction": 0.0, + "completion_length": 1986.6875381469727, + "epoch": 0.07085714285714285, + "grad_norm": 0.023419808596372604, + "kl": 8.377432823181152e-05, + "lambda_div_used": 0.6294708475470543, + "learning_rate": 8.896193111002475e-07, + "loss": 0.0145, + "reward": 0.127364382147789, + "reward_after_mean": 0.127364382147789, + "reward_after_std": 0.6607769038528204, + "reward_before_mean": 0.5438444633036852, + "reward_before_std": 0.6062875427305698, + "reward_change_max": 0.0, + "reward_change_mean": -0.41648009419441223, + "reward_change_min": -0.6949719563126564, + "reward_change_std": 0.27056892681866884, + "reward_std": 0.6607769187539816, + "rewards/accuracy_reward": 0.4166666753590107, + "rewards/cosine_scaled_reward": 0.12717779609374702, + "step": 62 + }, + { + "clip_fraction": 0.0, + "completion_length": 1558.8542098999023, + "epoch": 0.072, + "grad_norm": 0.029614871367812157, + "kl": 9.24495980143547e-05, + "lambda_div_used": 0.5882120281457901, + "learning_rate": 8.844151714648274e-07, + "loss": -0.0217, + "reward": 0.24260340631008148, + "reward_after_mean": 0.24260340631008148, + "reward_after_std": 0.5563407000154257, + "reward_before_mean": 0.8341647423803806, + "reward_before_std": 0.4078605566173792, + "reward_change_max": 0.0, + "reward_change_mean": -0.59156134724617, + "reward_change_min": -0.8355859033763409, + "reward_change_std": 0.33061067573726177, + "reward_std": 0.5563407260924578, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/cosine_scaled_reward": 0.29249807819724083, + "step": 63 + }, + { + "clip_fraction": 0.0, + "completion_length": 2526.979217529297, + "epoch": 0.07314285714285715, + "grad_norm": 0.023758206516504288, + "kl": 0.00014294683933258057, + "lambda_div_used": 0.5706712529063225, + "learning_rate": 8.791091657286267e-07, + "loss": -0.0491, + "reward": 0.005135258659720421, + "reward_after_mean": 0.005135258659720421, + "reward_after_std": 0.4730408936738968, + "reward_before_mean": 0.509556919336319, + "reward_before_std": 0.318298134021461, + "reward_change_max": 0.0, + "reward_change_mean": -0.5044216811656952, + "reward_change_min": -0.6950805820524693, + "reward_change_std": 0.2665313957259059, + "reward_std": 0.4730409197509289, + "rewards/accuracy_reward": 0.37500000558793545, + "rewards/cosine_scaled_reward": 0.13455691374838352, + "step": 64 + }, + { + "clip_fraction": 0.0, + "completion_length": 2619.6875343322754, + "epoch": 0.07428571428571429, + "grad_norm": 0.02972961962223053, + "kl": 9.997934103012085e-05, + "lambda_div_used": 0.5918664485216141, + "learning_rate": 8.737029101523929e-07, + "loss": 0.0061, + "reward": -0.009068667888641357, + "reward_after_mean": -0.009068667888641357, + "reward_after_std": 0.523827837780118, + "reward_before_mean": 0.42262596264481544, + "reward_before_std": 0.4240496205165982, + "reward_change_max": 0.0, + "reward_change_mean": -0.43169461004436016, + "reward_change_min": -0.6695713810622692, + "reward_change_std": 0.25297324638813734, + "reward_std": 0.5238278452306986, + "rewards/accuracy_reward": 0.3541666679084301, + "rewards/cosine_scaled_reward": 0.06845926493406296, + "step": 65 + }, + { + "clip_fraction": 0.0, + "completion_length": 2048.645835876465, + "epoch": 0.07542857142857143, + "grad_norm": 0.029114792123436928, + "kl": 8.37370753288269e-05, + "lambda_div_used": 0.5725493803620338, + "learning_rate": 8.681980515339463e-07, + "loss": -0.0155, + "reward": -0.16004172409884632, + "reward_after_mean": -0.16004172409884632, + "reward_after_std": 0.49360031075775623, + "reward_before_mean": 0.2602699510753155, + "reward_before_std": 0.33009787695482373, + "reward_change_max": 0.0, + "reward_change_mean": -0.4203116577118635, + "reward_change_min": -0.6120298802852631, + "reward_change_std": 0.2290408704429865, + "reward_std": 0.49360031820833683, + "rewards/accuracy_reward": 0.31250000186264515, + "rewards/cosine_scaled_reward": -0.05223005823791027, + "step": 66 + }, + { + "clip_fraction": 0.0, + "completion_length": 3108.1458740234375, + "epoch": 0.07657142857142857, + "grad_norm": 0.017497915774583817, + "kl": 0.00010813027620315552, + "lambda_div_used": 0.630346029996872, + "learning_rate": 8.625962667065487e-07, + "loss": -0.024, + "reward": -0.17341885343194008, + "reward_after_mean": -0.17341885343194008, + "reward_after_std": 0.6313954871147871, + "reward_before_mean": 0.0926114417379722, + "reward_before_std": 0.6103347176685929, + "reward_change_max": 0.0, + "reward_change_mean": -0.26603029295802116, + "reward_change_min": -0.5225372426211834, + "reward_change_std": 0.1893094191327691, + "reward_std": 0.6313955169171095, + "rewards/accuracy_reward": 0.16666666977107525, + "rewards/cosine_scaled_reward": -0.07405522745102644, + "step": 67 + }, + { + "clip_fraction": 0.0, + "completion_length": 1551.3541870117188, + "epoch": 0.07771428571428571, + "grad_norm": 0.03526793047785759, + "kl": 9.113550186157227e-05, + "lambda_div_used": 0.6158603206276894, + "learning_rate": 8.568992620281243e-07, + "loss": -0.07, + "reward": -0.04862045869231224, + "reward_after_mean": -0.04862045869231224, + "reward_after_std": 0.5381349269300699, + "reward_before_mean": 0.3049982152879238, + "reward_before_std": 0.5310354437679052, + "reward_change_max": 0.0, + "reward_change_mean": -0.35361868515610695, + "reward_change_min": -0.6167616136372089, + "reward_change_std": 0.2329900823533535, + "reward_std": 0.5381349604576826, + "rewards/accuracy_reward": 0.291666679084301, + "rewards/cosine_scaled_reward": 0.013331551104784012, + "step": 68 + }, + { + "clip_fraction": 0.0, + "completion_length": 1843.0000305175781, + "epoch": 0.07885714285714286, + "grad_norm": 0.034535001963377, + "kl": 0.00010597705841064453, + "lambda_div_used": 0.6196342781186104, + "learning_rate": 8.511087728614862e-07, + "loss": -0.0403, + "reward": -0.16936753690242767, + "reward_after_mean": -0.16936753690242767, + "reward_after_std": 0.5928534604609013, + "reward_before_mean": 0.10612463857978582, + "reward_before_std": 0.5493131745606661, + "reward_change_max": 0.0, + "reward_change_mean": -0.2754921726882458, + "reward_change_min": -0.4671623595058918, + "reward_change_std": 0.1736066685989499, + "reward_std": 0.592853469774127, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/cosine_scaled_reward": -0.06054202886298299, + "step": 69 + }, + { + "clip_fraction": 0.0, + "completion_length": 2385.6250610351562, + "epoch": 0.08, + "grad_norm": 0.022255579009652138, + "kl": 9.971857070922852e-05, + "lambda_div_used": 0.5713987499475479, + "learning_rate": 8.452265630457282e-07, + "loss": 0.0293, + "reward": -0.15412342175841331, + "reward_after_mean": -0.15412342175841331, + "reward_after_std": 0.4452939387410879, + "reward_before_mean": 0.2508242540061474, + "reward_before_std": 0.32261871080845594, + "reward_change_max": 0.0, + "reward_change_mean": -0.4049476757645607, + "reward_change_min": -0.5730207115411758, + "reward_change_std": 0.21870618779212236, + "reward_std": 0.44529395177960396, + "rewards/accuracy_reward": 0.25000000558793545, + "rewards/cosine_scaled_reward": 0.0008242330513894558, + "step": 70 + }, + { + "clip_fraction": 0.0, + "completion_length": 2444.6666717529297, + "epoch": 0.08114285714285714, + "grad_norm": 0.03434319049119949, + "kl": 0.0001207888126373291, + "lambda_div_used": 0.6165208369493484, + "learning_rate": 8.392544243589427e-07, + "loss": -0.0114, + "reward": -0.04324318375438452, + "reward_after_mean": -0.04324318375438452, + "reward_after_std": 0.5684425849467516, + "reward_before_mean": 0.3016075724735856, + "reward_before_std": 0.535145154222846, + "reward_change_max": 0.0, + "reward_change_mean": -0.3448507599532604, + "reward_change_min": -0.5520102642476559, + "reward_change_std": 0.21349613554775715, + "reward_std": 0.5684425886720419, + "rewards/accuracy_reward": 0.27083334513008595, + "rewards/cosine_scaled_reward": 0.030774242244660854, + "step": 71 + }, + { + "clip_fraction": 0.0, + "completion_length": 2275.229232788086, + "epoch": 0.08228571428571428, + "grad_norm": 0.026326576247811317, + "kl": 0.0001144111156463623, + "lambda_div_used": 0.5820747911930084, + "learning_rate": 8.331941759724268e-07, + "loss": 0.0486, + "reward": -0.22072702879086137, + "reward_after_mean": -0.22072702879086137, + "reward_after_std": 0.4294308237731457, + "reward_before_mean": 0.11065018083900213, + "reward_before_std": 0.37079737335443497, + "reward_change_max": 0.0, + "reward_change_mean": -0.3313772287219763, + "reward_change_min": -0.49349113181233406, + "reward_change_std": 0.18969058711081743, + "reward_std": 0.4294308312237263, + "rewards/accuracy_reward": 0.16666667349636555, + "rewards/cosine_scaled_reward": -0.05601647589355707, + "step": 72 + }, + { + "clip_fraction": 0.0, + "completion_length": 3070.5209350585938, + "epoch": 0.08342857142857144, + "grad_norm": 0.0183473639190197, + "kl": 0.00015050172805786133, + "lambda_div_used": 0.6504631415009499, + "learning_rate": 8.270476638965461e-07, + "loss": 0.0732, + "reward": -0.024378453381359577, + "reward_after_mean": -0.024378453381359577, + "reward_after_std": 0.719476904720068, + "reward_before_mean": 0.25941105699166656, + "reward_before_std": 0.7029449231922626, + "reward_change_max": 0.0, + "reward_change_mean": -0.28378950990736485, + "reward_change_min": -0.4811365678906441, + "reward_change_std": 0.19341129437088966, + "reward_std": 0.7194769158959389, + "rewards/accuracy_reward": 0.2708333395421505, + "rewards/cosine_scaled_reward": -0.01142227090895176, + "step": 73 + }, + { + "clip_fraction": 0.0, + "completion_length": 2230.604202270508, + "epoch": 0.08457142857142858, + "grad_norm": 0.02717745117843151, + "kl": 9.462237358093262e-05, + "lambda_div_used": 0.6418861970305443, + "learning_rate": 8.208167604184217e-07, + "loss": -0.0956, + "reward": -0.0033467919565737247, + "reward_after_mean": -0.0033467919565737247, + "reward_after_std": 0.6568808052688837, + "reward_before_mean": 0.3098285049200058, + "reward_before_std": 0.6575386971235275, + "reward_change_max": 0.0, + "reward_change_mean": -0.3131752759218216, + "reward_change_min": -0.6135218031704426, + "reward_change_std": 0.22661382239311934, + "reward_std": 0.6568808313459158, + "rewards/accuracy_reward": 0.2916666753590107, + "rewards/cosine_scaled_reward": 0.018161814659833908, + "step": 74 + }, + { + "clip_fraction": 0.0, + "completion_length": 2656.416732788086, + "epoch": 0.08571428571428572, + "grad_norm": 0.018360283225774765, + "kl": 0.00010110437870025635, + "lambda_div_used": 0.5670187771320343, + "learning_rate": 8.145033635316128e-07, + "loss": -0.0053, + "reward": 0.006984639912843704, + "reward_after_mean": 0.006984639912843704, + "reward_after_std": 0.42523463629186153, + "reward_before_mean": 0.5017829714342952, + "reward_before_std": 0.3057099119760096, + "reward_change_max": 0.0, + "reward_change_mean": -0.49479835107922554, + "reward_change_min": -0.685304194688797, + "reward_change_std": 0.27397861890494823, + "reward_std": 0.4252346530556679, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/cosine_scaled_reward": 0.1684496277011931, + "step": 75 + }, + { + "clip_fraction": 0.0, + "completion_length": 2364.0000228881836, + "epoch": 0.08685714285714285, + "grad_norm": 0.026309814304113388, + "kl": 0.00010971724987030029, + "lambda_div_used": 0.5593390390276909, + "learning_rate": 8.081093963579707e-07, + "loss": 0.0787, + "reward": -0.24861154425889254, + "reward_after_mean": -0.24861154425889254, + "reward_after_std": 0.3816519398242235, + "reward_before_mean": 0.14198310300707817, + "reward_before_std": 0.2715805321931839, + "reward_change_max": 0.0, + "reward_change_mean": -0.3905946556478739, + "reward_change_min": -0.5774808749556541, + "reward_change_std": 0.21815251000225544, + "reward_std": 0.3816519435495138, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.02468355232849717, + "step": 76 + }, + { + "clip_fraction": 0.0, + "completion_length": 2667.750030517578, + "epoch": 0.088, + "grad_norm": 0.020327381789684296, + "kl": 0.00012464821338653564, + "lambda_div_used": 0.5539242178201675, + "learning_rate": 8.01636806561836e-07, + "loss": -0.003, + "reward": -0.2628857381641865, + "reward_after_mean": -0.2628857381641865, + "reward_after_std": 0.31219773180782795, + "reward_before_mean": 0.10476060304790735, + "reward_before_std": 0.24125095596536994, + "reward_change_max": 0.0, + "reward_change_mean": -0.36764636635780334, + "reward_change_min": -0.5086396895349026, + "reward_change_std": 0.20270386710762978, + "reward_std": 0.3121977373957634, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/cosine_scaled_reward": -0.08273938857018948, + "step": 77 + }, + { + "clip_fraction": 0.0, + "completion_length": 2686.8125534057617, + "epoch": 0.08914285714285715, + "grad_norm": 0.021180735900998116, + "kl": 0.0001368001103401184, + "lambda_div_used": 0.6677093878388405, + "learning_rate": 7.950875657567621e-07, + "loss": 0.11, + "reward": 0.1766232904046774, + "reward_after_mean": 0.1766232904046774, + "reward_after_std": 0.8389766626060009, + "reward_before_mean": 0.5535875726491213, + "reward_before_std": 0.7847605030983686, + "reward_change_max": 0.0, + "reward_change_mean": -0.3769642859697342, + "reward_change_min": -0.6617914140224457, + "reward_change_std": 0.25535117369145155, + "reward_std": 0.8389766924083233, + "rewards/accuracy_reward": 0.3958333358168602, + "rewards/cosine_scaled_reward": 0.15775423496961594, + "step": 78 + }, + { + "clip_fraction": 0.0, + "completion_length": 1962.8750457763672, + "epoch": 0.09028571428571429, + "grad_norm": 0.030601773411035538, + "kl": 8.64267349243164e-05, + "lambda_div_used": 0.6305629685521126, + "learning_rate": 7.884636689049422e-07, + "loss": -0.0874, + "reward": -0.07519742846488953, + "reward_after_mean": -0.07519742846488953, + "reward_after_std": 0.6089170537889004, + "reward_before_mean": 0.22960891388356686, + "reward_before_std": 0.611657090485096, + "reward_change_max": 0.0, + "reward_change_mean": -0.304806362837553, + "reward_change_min": -0.5647807456552982, + "reward_change_std": 0.21934652887284756, + "reward_std": 0.608917074277997, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/cosine_scaled_reward": -0.04122441209619865, + "step": 79 + }, + { + "clip_fraction": 0.0, + "completion_length": 2891.8333587646484, + "epoch": 0.09142857142857143, + "grad_norm": 0.020956117659807205, + "kl": 0.00014778971672058105, + "lambda_div_used": 0.5999866053462029, + "learning_rate": 7.817671337095244e-07, + "loss": 0.0016, + "reward": -0.013454930856823921, + "reward_after_mean": -0.013454930856823921, + "reward_after_std": 0.5396539904177189, + "reward_before_mean": 0.39186157658696175, + "reward_before_std": 0.4594053290784359, + "reward_change_max": 0.0, + "reward_change_mean": -0.4053164832293987, + "reward_change_min": -0.6001381352543831, + "reward_change_std": 0.23710554651916027, + "reward_std": 0.5396539978682995, + "rewards/accuracy_reward": 0.33333334140479565, + "rewards/cosine_scaled_reward": 0.058528220281004906, + "step": 80 + }, + { + "clip_fraction": 0.0, + "completion_length": 2756.125057220459, + "epoch": 0.09257142857142857, + "grad_norm": 0.0333174467086792, + "kl": 0.0001837015151977539, + "lambda_div_used": 0.6196143180131912, + "learning_rate": 7.75e-07, + "loss": -0.019, + "reward": -0.15753823146224022, + "reward_after_mean": -0.15753823146224022, + "reward_after_std": 0.5845062825828791, + "reward_before_mean": 0.12821976901614107, + "reward_before_std": 0.5542377643287182, + "reward_change_max": 0.0, + "reward_change_mean": -0.28575799986720085, + "reward_change_min": -0.5201962739229202, + "reward_change_std": 0.1913682147860527, + "reward_std": 0.5845062825828791, + "rewards/accuracy_reward": 0.20833334140479565, + "rewards/cosine_scaled_reward": -0.08011356927454472, + "step": 81 + }, + { + "clip_fraction": 0.0, + "completion_length": 2416.895851135254, + "epoch": 0.09371428571428571, + "grad_norm": 0.02389819547533989, + "kl": 0.00010943412780761719, + "lambda_div_used": 0.6039423421025276, + "learning_rate": 7.681643291108517e-07, + "loss": -0.0069, + "reward": -0.03246039338409901, + "reward_after_mean": -0.03246039338409901, + "reward_after_std": 0.5534762311726809, + "reward_before_mean": 0.3658472504466772, + "reward_before_std": 0.4814961114898324, + "reward_change_max": 0.0, + "reward_change_mean": -0.3983076363801956, + "reward_change_min": -0.655058030039072, + "reward_change_std": 0.24730877578258514, + "reward_std": 0.5534762516617775, + "rewards/accuracy_reward": 0.3125000074505806, + "rewards/cosine_scaled_reward": 0.05334722436964512, + "step": 82 + }, + { + "clip_fraction": 0.0, + "completion_length": 2424.3125534057617, + "epoch": 0.09485714285714286, + "grad_norm": 0.032633859664201736, + "kl": 0.00012442469596862793, + "lambda_div_used": 0.6332797482609749, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0298, + "reward": 0.024711462669074535, + "reward_after_mean": 0.024711462669074535, + "reward_after_std": 0.6729160957038403, + "reward_before_mean": 0.3993762247264385, + "reward_before_std": 0.6236082511022687, + "reward_change_max": 0.0, + "reward_change_mean": -0.3746647536754608, + "reward_change_min": -0.691291693598032, + "reward_change_std": 0.2538198195397854, + "reward_std": 0.6729161199182272, + "rewards/accuracy_reward": 0.3125000037252903, + "rewards/cosine_scaled_reward": 0.08687621541321278, + "step": 83 + }, + { + "clip_fraction": 0.0, + "completion_length": 2199.125026702881, + "epoch": 0.096, + "grad_norm": 0.023145966231822968, + "kl": 9.820610284805298e-05, + "lambda_div_used": 0.6221627593040466, + "learning_rate": 7.54295724882796e-07, + "loss": -0.0603, + "reward": 0.019974265713244677, + "reward_after_mean": 0.019974265713244677, + "reward_after_std": 0.6427162848412991, + "reward_before_mean": 0.40708103217184544, + "reward_before_std": 0.5660292999818921, + "reward_change_max": 0.0, + "reward_change_mean": -0.3871067576110363, + "reward_change_min": -0.6353648640215397, + "reward_change_std": 0.23979215417057276, + "reward_std": 0.6427163053303957, + "rewards/accuracy_reward": 0.33333334140479565, + "rewards/cosine_scaled_reward": 0.07374768820591271, + "step": 84 + }, + { + "clip_fraction": 0.0, + "completion_length": 2697.812545776367, + "epoch": 0.09714285714285714, + "grad_norm": 0.018655812367796898, + "kl": 0.0001118779182434082, + "lambda_div_used": 0.6938974410295486, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0247, + "reward": 0.22309327218681574, + "reward_after_mean": 0.22309327218681574, + "reward_after_std": 0.8897394463419914, + "reward_before_mean": 0.5425103409215808, + "reward_before_std": 0.909519312903285, + "reward_change_max": 0.0, + "reward_change_mean": -0.3194170743227005, + "reward_change_min": -0.6195828355848789, + "reward_change_std": 0.24075988680124283, + "reward_std": 0.889739491045475, + "rewards/accuracy_reward": 0.3750000111758709, + "rewards/cosine_scaled_reward": 0.16751032788306475, + "step": 85 + }, + { + "clip_fraction": 0.0, + "completion_length": 2550.145851135254, + "epoch": 0.09828571428571428, + "grad_norm": 0.02583778277039528, + "kl": 0.00013977289199829102, + "lambda_div_used": 0.5981776341795921, + "learning_rate": 7.401782177833147e-07, + "loss": -0.0016, + "reward": -0.1776493340730667, + "reward_after_mean": -0.1776493340730667, + "reward_after_std": 0.4848842676728964, + "reward_before_mean": 0.13707906752824783, + "reward_before_std": 0.44980547949671745, + "reward_change_max": 0.0, + "reward_change_mean": -0.31472842022776604, + "reward_change_min": -0.5129407718777657, + "reward_change_std": 0.19486056733876467, + "reward_std": 0.4848842900246382, + "rewards/accuracy_reward": 0.16666667349636555, + "rewards/cosine_scaled_reward": -0.029587595723569393, + "step": 86 + }, + { + "clip_fraction": 0.0, + "completion_length": 2216.520866394043, + "epoch": 0.09942857142857142, + "grad_norm": 0.026928512379527092, + "kl": 0.00014719367027282715, + "lambda_div_used": 0.5552037805318832, + "learning_rate": 7.330314893841101e-07, + "loss": -0.0112, + "reward": -0.14393189689144492, + "reward_after_mean": -0.14393189689144492, + "reward_after_std": 0.41725931130349636, + "reward_before_mean": 0.3129472378641367, + "reward_before_std": 0.24625429138541222, + "reward_change_max": 0.0, + "reward_change_mean": -0.4568791352212429, + "reward_change_min": -0.6228058040142059, + "reward_change_std": 0.2342971321195364, + "reward_std": 0.41725931875407696, + "rewards/accuracy_reward": 0.27083333395421505, + "rewards/cosine_scaled_reward": 0.04211391881108284, + "step": 87 + }, + { + "clip_fraction": 0.0, + "completion_length": 1673.833366394043, + "epoch": 0.10057142857142858, + "grad_norm": 0.03403550013899803, + "kl": 9.492039680480957e-05, + "lambda_div_used": 0.6714882552623749, + "learning_rate": 7.258290078201731e-07, + "loss": 0.1357, + "reward": 0.23210743255913258, + "reward_after_mean": 0.23210743255913258, + "reward_after_std": 0.7776901721954346, + "reward_before_mean": 0.6004263032227755, + "reward_before_std": 0.8064676076173782, + "reward_change_max": 0.0, + "reward_change_mean": -0.3683188706636429, + "reward_change_min": -0.6842552609741688, + "reward_change_std": 0.2728601209819317, + "reward_std": 0.7776901982724667, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/cosine_scaled_reward": 0.16292626922950149, + "step": 88 + }, + { + "clip_fraction": 0.0, + "completion_length": 2455.875030517578, + "epoch": 0.10171428571428572, + "grad_norm": 0.02145099826157093, + "kl": 0.00010183453559875488, + "lambda_div_used": 0.6517865061759949, + "learning_rate": 7.185729670371604e-07, + "loss": 0.0088, + "reward": 0.022896312177181244, + "reward_after_mean": 0.022896312177181244, + "reward_after_std": 0.7065913639962673, + "reward_before_mean": 0.327204130589962, + "reward_before_std": 0.7111460026353598, + "reward_change_max": 0.0, + "reward_change_mean": -0.30430781841278076, + "reward_change_min": -0.5758539959788322, + "reward_change_std": 0.22174649592489004, + "reward_std": 0.7065913733094931, + "rewards/accuracy_reward": 0.27083333767950535, + "rewards/cosine_scaled_reward": 0.056370790116488934, + "step": 89 + }, + { + "clip_fraction": 0.0, + "completion_length": 2390.3125381469727, + "epoch": 0.10285714285714286, + "grad_norm": 0.04259632155299187, + "kl": 0.00015431642532348633, + "lambda_div_used": 0.5984295755624771, + "learning_rate": 7.11265577295385e-07, + "loss": 0.034, + "reward": -0.32630743458867073, + "reward_after_mean": -0.32630743458867073, + "reward_after_std": 0.5043698158115149, + "reward_before_mean": -0.07746448495890945, + "reward_before_std": 0.4487060569226742, + "reward_change_max": 0.0, + "reward_change_mean": -0.24884295091032982, + "reward_change_min": -0.42664676532149315, + "reward_change_std": 0.15124379005283117, + "reward_std": 0.5043698251247406, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/cosine_scaled_reward": -0.1607978215906769, + "step": 90 + }, + { + "clip_fraction": 0.0, + "completion_length": 2609.041702270508, + "epoch": 0.104, + "grad_norm": 0.022521013393998146, + "kl": 0.0001251697540283203, + "lambda_div_used": 0.6242434978485107, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0254, + "reward": 0.06410847418010235, + "reward_after_mean": 0.06410847418010235, + "reward_after_std": 0.6396163944154978, + "reward_before_mean": 0.4679965991526842, + "reward_before_std": 0.5784324184060097, + "reward_change_max": 0.0, + "reward_change_mean": -0.4038881305605173, + "reward_change_min": -0.6435716077685356, + "reward_change_std": 0.2524958234280348, + "reward_std": 0.6396164130419493, + "rewards/accuracy_reward": 0.35416666977107525, + "rewards/cosine_scaled_reward": 0.11382993124425411, + "step": 91 + }, + { + "clip_fraction": 0.0, + "completion_length": 2140.312526702881, + "epoch": 0.10514285714285715, + "grad_norm": 0.028272144496440887, + "kl": 8.565187454223633e-05, + "lambda_div_used": 0.6041740253567696, + "learning_rate": 6.965056695057204e-07, + "loss": -0.0131, + "reward": -0.22151808440685272, + "reward_after_mean": -0.22151808440685272, + "reward_after_std": 0.5138680338859558, + "reward_before_mean": 0.06442609056830406, + "reward_before_std": 0.48106229305267334, + "reward_change_max": 0.0, + "reward_change_mean": -0.28594417311251163, + "reward_change_min": -0.4845799170434475, + "reward_change_std": 0.1817196160554886, + "reward_std": 0.5138680376112461, + "rewards/accuracy_reward": 0.16666667349636555, + "rewards/cosine_scaled_reward": -0.10224058385938406, + "step": 92 + }, + { + "clip_fraction": 0.0, + "completion_length": 3470.3333435058594, + "epoch": 0.10628571428571429, + "grad_norm": 0.02051234431564808, + "kl": 0.00021988153457641602, + "lambda_div_used": 0.550777792930603, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0098, + "reward": -0.4400870492681861, + "reward_after_mean": -0.4400870492681861, + "reward_after_std": 0.30945760011672974, + "reward_before_mean": -0.15331347286701202, + "reward_before_std": 0.22546498104929924, + "reward_change_max": 0.0, + "reward_change_mean": -0.2867735829204321, + "reward_change_min": -0.40346677228808403, + "reward_change_std": 0.15250255912542343, + "reward_std": 0.3094576168805361, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.17414680309593678, + "step": 93 + }, + { + "clip_fraction": 0.0, + "completion_length": 2433.375045776367, + "epoch": 0.10742857142857143, + "grad_norm": 0.02587928995490074, + "kl": 0.0001605413854122162, + "lambda_div_used": 0.5997196212410927, + "learning_rate": 6.815672671252315e-07, + "loss": 0.0496, + "reward": -0.1711340295150876, + "reward_after_mean": -0.1711340295150876, + "reward_after_std": 0.5549158975481987, + "reward_before_mean": 0.1783520970493555, + "reward_before_std": 0.45881492272019386, + "reward_change_max": 0.0, + "reward_change_mean": -0.34948613308370113, + "reward_change_min": -0.5870647989213467, + "reward_change_std": 0.21199375297874212, + "reward_std": 0.5549159198999405, + "rewards/accuracy_reward": 0.22916666977107525, + "rewards/cosine_scaled_reward": -0.05081457580672577, + "step": 94 + }, + { + "clip_fraction": 0.0, + "completion_length": 3021.2083740234375, + "epoch": 0.10857142857142857, + "grad_norm": 0.016700398176908493, + "kl": 0.00013339519500732422, + "lambda_div_used": 0.6040999740362167, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0501, + "reward": -0.10988862998783588, + "reward_after_mean": -0.10988862998783588, + "reward_after_std": 0.574177211150527, + "reward_before_mean": 0.259714370011352, + "reward_before_std": 0.4856903199106455, + "reward_change_max": 0.0, + "reward_change_mean": -0.36960301361978054, + "reward_change_min": -0.6000017635524273, + "reward_change_std": 0.2258477583527565, + "reward_std": 0.5741772279143333, + "rewards/accuracy_reward": 0.25000000186264515, + "rewards/cosine_scaled_reward": 0.009714371990412474, + "step": 95 + }, + { + "clip_fraction": 0.0, + "completion_length": 2388.979202270508, + "epoch": 0.10971428571428571, + "grad_norm": 0.02693852037191391, + "kl": 0.00010056048631668091, + "lambda_div_used": 0.6452958509325981, + "learning_rate": 6.664685702961344e-07, + "loss": 0.0131, + "reward": 0.1582602821290493, + "reward_after_mean": 0.1582602821290493, + "reward_after_std": 0.7448761742562056, + "reward_before_mean": 0.5632272865623236, + "reward_before_std": 0.6762064695358276, + "reward_change_max": 0.0, + "reward_change_mean": -0.4049670249223709, + "reward_change_min": -0.6407857313752174, + "reward_change_std": 0.2506500957533717, + "reward_std": 0.7448761742562056, + "rewards/accuracy_reward": 0.3958333395421505, + "rewards/cosine_scaled_reward": 0.1673939572647214, + "step": 96 + }, + { + "clip_fraction": 0.0, + "completion_length": 2700.0000534057617, + "epoch": 0.11085714285714286, + "grad_norm": 0.022453829646110535, + "kl": 0.00012642145156860352, + "lambda_div_used": 0.5770404115319252, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0441, + "reward": -0.24632946588099003, + "reward_after_mean": -0.24632946588099003, + "reward_after_std": 0.4705936200916767, + "reward_before_mean": 0.10453066416084766, + "reward_before_std": 0.3494142349809408, + "reward_change_max": 0.0, + "reward_change_mean": -0.35086013562977314, + "reward_change_min": -0.5019582267850637, + "reward_change_std": 0.18468604423105717, + "reward_std": 0.4705936200916767, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.06213599909096956, + "step": 97 + }, + { + "clip_fraction": 0.0, + "completion_length": 2192.8750534057617, + "epoch": 0.112, + "grad_norm": 0.021812062710523605, + "kl": 8.285045623779297e-05, + "lambda_div_used": 0.6265708059072495, + "learning_rate": 6.512279744547392e-07, + "loss": 0.0464, + "reward": -0.06961194425821304, + "reward_after_mean": -0.06961194425821304, + "reward_after_std": 0.705444872379303, + "reward_before_mean": 0.27397448010742664, + "reward_before_std": 0.5812770891934633, + "reward_change_max": 0.0, + "reward_change_mean": -0.3435864243656397, + "reward_change_min": -0.521237924695015, + "reward_change_std": 0.19378468580543995, + "reward_std": 0.7054448891431093, + "rewards/accuracy_reward": 0.2500000037252903, + "rewards/cosine_scaled_reward": 0.023974468291271478, + "step": 98 + }, + { + "clip_fraction": 0.0, + "completion_length": 2786.1875534057617, + "epoch": 0.11314285714285714, + "grad_norm": 0.02357642538845539, + "kl": 0.00013721734285354614, + "lambda_div_used": 0.6171735525131226, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0071, + "reward": -0.024897070601582527, + "reward_after_mean": -0.024897070601582527, + "reward_after_std": 0.6280203014612198, + "reward_before_mean": 0.3600337319076061, + "reward_before_std": 0.5423067910596728, + "reward_change_max": 0.0, + "reward_change_mean": -0.3849308080971241, + "reward_change_min": -0.6169135756790638, + "reward_change_std": 0.2341146618127823, + "reward_std": 0.6280203089118004, + "rewards/accuracy_reward": 0.31250000558793545, + "rewards/cosine_scaled_reward": 0.04753373749554157, + "step": 99 + }, + { + "clip_fraction": 0.0, + "completion_length": 2387.687545776367, + "epoch": 0.11428571428571428, + "grad_norm": 0.023666124790906906, + "kl": 0.00012056529521942139, + "lambda_div_used": 0.6473532766103745, + "learning_rate": 6.358640479194451e-07, + "loss": -0.0079, + "reward": 0.10418231040239334, + "reward_after_mean": 0.10418231040239334, + "reward_after_std": 0.6740316934883595, + "reward_before_mean": 0.4593420661985874, + "reward_before_std": 0.6859642090275884, + "reward_change_max": 0.0, + "reward_change_mean": -0.3551597539335489, + "reward_change_min": -0.6693484336137772, + "reward_change_std": 0.2530285455286503, + "reward_std": 0.6740317121148109, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/cosine_scaled_reward": 0.12600870989263058, + "step": 100 + }, + { + "clip_fraction": 0.0, + "completion_length": 2082.4167098999023, + "epoch": 0.11542857142857142, + "grad_norm": 0.025257760658860207, + "kl": 0.00012496113777160645, + "lambda_div_used": 0.6337181106209755, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0127, + "reward": 0.08042715396732092, + "reward_after_mean": 0.08042715396732092, + "reward_after_std": 0.6870364677160978, + "reward_before_mean": 0.4686305020004511, + "reward_before_std": 0.6316956970840693, + "reward_change_max": 0.0, + "reward_change_mean": -0.3882033359259367, + "reward_change_min": -0.6386316269636154, + "reward_change_std": 0.2518463246524334, + "reward_std": 0.6870364770293236, + "rewards/accuracy_reward": 0.33333333767950535, + "rewards/cosine_scaled_reward": 0.13529715640470386, + "step": 101 + }, + { + "clip_fraction": 0.0, + "completion_length": 2037.208381652832, + "epoch": 0.11657142857142858, + "grad_norm": 0.030245469883084297, + "kl": 0.00012689828872680664, + "lambda_div_used": 0.6057270467281342, + "learning_rate": 6.203955092681039e-07, + "loss": 0.0419, + "reward": 0.04565976280719042, + "reward_after_mean": 0.04565976280719042, + "reward_after_std": 0.58235695771873, + "reward_before_mean": 0.49184186570346355, + "reward_before_std": 0.48894889652729034, + "reward_change_max": 0.0, + "reward_change_mean": -0.4461820814758539, + "reward_change_min": -0.7175954841077328, + "reward_change_std": 0.27023847959935665, + "reward_std": 0.5823569800704718, + "rewards/accuracy_reward": 0.3125000074505806, + "rewards/cosine_scaled_reward": 0.17934184102341533, + "step": 102 + }, + { + "clip_fraction": 0.0, + "completion_length": 2204.291717529297, + "epoch": 0.11771428571428572, + "grad_norm": 0.035654906183481216, + "kl": 0.00011385977268218994, + "lambda_div_used": 0.5589818432927132, + "learning_rate": 6.126278954320294e-07, + "loss": -0.0085, + "reward": -0.2170967198908329, + "reward_after_mean": -0.2170967198908329, + "reward_after_std": 0.37913205102086067, + "reward_before_mean": 0.17684321105480194, + "reward_before_std": 0.26896010898053646, + "reward_change_max": 0.0, + "reward_change_mean": -0.3939399253576994, + "reward_change_min": -0.5747586265206337, + "reward_change_std": 0.2185236681252718, + "reward_std": 0.3791320640593767, + "rewards/accuracy_reward": 0.1875, + "rewards/cosine_scaled_reward": -0.010656798258423805, + "step": 103 + }, + { + "clip_fraction": 0.0, + "completion_length": 2460.1458435058594, + "epoch": 0.11885714285714286, + "grad_norm": 0.034317746758461, + "kl": 0.00014722347259521484, + "lambda_div_used": 0.5736617371439934, + "learning_rate": 6.048412045323164e-07, + "loss": -0.0349, + "reward": -0.1729673482477665, + "reward_after_mean": -0.1729673482477665, + "reward_after_std": 0.44365703873336315, + "reward_before_mean": 0.2049333555623889, + "reward_before_std": 0.3348153894767165, + "reward_change_max": 0.0, + "reward_change_mean": -0.3779007289558649, + "reward_change_min": -0.5351628288626671, + "reward_change_std": 0.20819698367267847, + "reward_std": 0.4436570517718792, + "rewards/accuracy_reward": 0.22916666977107525, + "rewards/cosine_scaled_reward": -0.024233306758105755, + "step": 104 + }, + { + "clip_fraction": 0.0, + "completion_length": 2288.958381652832, + "epoch": 0.12, + "grad_norm": 0.02615894190967083, + "kl": 0.00012151896953582764, + "lambda_div_used": 0.6090050563216209, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0354, + "reward": 0.04345609247684479, + "reward_after_mean": 0.04345609247684479, + "reward_after_std": 0.5673400796949863, + "reward_before_mean": 0.4590000305324793, + "reward_before_std": 0.5081056347116828, + "reward_change_max": 0.0, + "reward_change_mean": -0.41554390639066696, + "reward_change_min": -0.6591046005487442, + "reward_change_std": 0.25887916050851345, + "reward_std": 0.5673400945961475, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/cosine_scaled_reward": 0.1256666723638773, + "step": 105 + }, + { + "clip_fraction": 0.0, + "completion_length": 1890.4791870117188, + "epoch": 0.12114285714285715, + "grad_norm": 0.02484404481947422, + "kl": 5.9254467487335205e-05, + "lambda_div_used": 0.5908190160989761, + "learning_rate": 5.892200842364462e-07, + "loss": -0.031, + "reward": 0.13268680218607187, + "reward_after_mean": 0.13268680218607187, + "reward_after_std": 0.5557667016983032, + "reward_before_mean": 0.6536997258663177, + "reward_before_std": 0.4149925457313657, + "reward_change_max": 0.0, + "reward_change_mean": -0.5210129152983427, + "reward_change_min": -0.7558874487876892, + "reward_change_std": 0.2945323744788766, + "reward_std": 0.555766711011529, + "rewards/accuracy_reward": 0.4791666753590107, + "rewards/cosine_scaled_reward": 0.1745330523699522, + "step": 106 + }, + { + "clip_fraction": 0.0, + "completion_length": 2767.75004196167, + "epoch": 0.12228571428571429, + "grad_norm": 0.02508847787976265, + "kl": 0.00016683340072631836, + "lambda_div_used": 0.5527122691273689, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0551, + "reward": -0.10423782840371132, + "reward_after_mean": -0.10423782840371132, + "reward_after_std": 0.40749385207891464, + "reward_before_mean": 0.3917595148086548, + "reward_before_std": 0.23519209399819374, + "reward_change_max": 0.0, + "reward_change_mean": -0.4959973506629467, + "reward_change_min": -0.6841400265693665, + "reward_change_std": 0.26082153245806694, + "reward_std": 0.40749386698007584, + "rewards/accuracy_reward": 0.3125, + "rewards/cosine_scaled_reward": 0.07925950735807419, + "step": 107 + }, + { + "clip_fraction": 0.0, + "completion_length": 2765.1041870117188, + "epoch": 0.12342857142857143, + "grad_norm": 0.01993192359805107, + "kl": 0.0001267939805984497, + "lambda_div_used": 0.5970002189278603, + "learning_rate": 5.735511803093248e-07, + "loss": -0.0109, + "reward": 0.0014873668551445007, + "reward_after_mean": 0.0014873668551445007, + "reward_after_std": 0.5301447622478008, + "reward_before_mean": 0.4201008062809706, + "reward_before_std": 0.44587238878011703, + "reward_change_max": 0.0, + "reward_change_mean": -0.41861344687640667, + "reward_change_min": -0.6076503656804562, + "reward_change_std": 0.24174897000193596, + "reward_std": 0.5301447845995426, + "rewards/accuracy_reward": 0.33333334140479565, + "rewards/cosine_scaled_reward": 0.08676748792640865, + "step": 108 + }, + { + "clip_fraction": 0.0, + "completion_length": 2777.8541946411133, + "epoch": 0.12457142857142857, + "grad_norm": 0.023633325472474098, + "kl": 0.0001358315348625183, + "lambda_div_used": 0.5594293028116226, + "learning_rate": 5.657047735161255e-07, + "loss": -0.0301, + "reward": -0.27439120411872864, + "reward_after_mean": -0.27439120411872864, + "reward_after_std": 0.3761340919882059, + "reward_before_mean": 0.09332936629652977, + "reward_before_std": 0.26834795251488686, + "reward_change_max": 0.0, + "reward_change_mean": -0.36772056482732296, + "reward_change_min": -0.5622114911675453, + "reward_change_std": 0.20468105003237724, + "reward_std": 0.37613409385085106, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.07333731185644865, + "step": 109 + }, + { + "clip_fraction": 0.0, + "completion_length": 2538.7917098999023, + "epoch": 0.12571428571428572, + "grad_norm": 0.028952863067388535, + "kl": 0.00011619925498962402, + "lambda_div_used": 0.6277726292610168, + "learning_rate": 5.578535828967777e-07, + "loss": -0.0225, + "reward": 0.13001340767368674, + "reward_after_mean": 0.13001340767368674, + "reward_after_std": 0.6052438467741013, + "reward_before_mean": 0.5293129477649927, + "reward_before_std": 0.5916427094489336, + "reward_change_max": 0.0, + "reward_change_mean": -0.3992995321750641, + "reward_change_min": -0.6062962152063847, + "reward_change_std": 0.25030655320733786, + "reward_std": 0.6052438709884882, + "rewards/accuracy_reward": 0.41666668467223644, + "rewards/cosine_scaled_reward": 0.1126462584361434, + "step": 110 + }, + { + "clip_fraction": 0.0, + "completion_length": 2590.833351135254, + "epoch": 0.12685714285714286, + "grad_norm": 0.023588471114635468, + "kl": 0.00016444921493530273, + "lambda_div_used": 0.6258220672607422, + "learning_rate": 5.5e-07, + "loss": 0.0326, + "reward": 0.05748726427555084, + "reward_after_mean": 0.05748726427555084, + "reward_after_std": 0.6602962389588356, + "reward_before_mean": 0.4434679429978132, + "reward_before_std": 0.5859999302774668, + "reward_change_max": 0.0, + "reward_change_mean": -0.3859807029366493, + "reward_change_min": -0.6280175969004631, + "reward_change_std": 0.24166050180792809, + "reward_std": 0.6602962575852871, + "rewards/accuracy_reward": 0.33333333767950535, + "rewards/cosine_scaled_reward": 0.11013460718095303, + "step": 111 + }, + { + "clip_fraction": 0.0, + "completion_length": 2659.3750915527344, + "epoch": 0.128, + "grad_norm": 0.022469794377684593, + "kl": 0.0001360177993774414, + "lambda_div_used": 0.6310129314661026, + "learning_rate": 5.421464171032224e-07, + "loss": 0.0414, + "reward": 0.12392518669366837, + "reward_after_mean": 0.12392518669366837, + "reward_after_std": 0.6471672505140305, + "reward_before_mean": 0.5420562420040369, + "reward_before_std": 0.6111889835447073, + "reward_change_max": 0.0, + "reward_change_mean": -0.4181310646235943, + "reward_change_min": -0.6937783919274807, + "reward_change_std": 0.27559296786785126, + "reward_std": 0.6471672654151917, + "rewards/accuracy_reward": 0.3958333395421505, + "rewards/cosine_scaled_reward": 0.14622290851548314, + "step": 112 + }, + { + "clip_fraction": 0.0, + "completion_length": 1887.0000534057617, + "epoch": 0.12914285714285714, + "grad_norm": 0.034688595682382584, + "kl": 0.0001182258129119873, + "lambda_div_used": 0.6175974532961845, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0045, + "reward": -0.10040622856467962, + "reward_after_mean": -0.10040622856467962, + "reward_after_std": 0.5642315912991762, + "reward_before_mean": 0.21734675765037537, + "reward_before_std": 0.5426071379333735, + "reward_change_max": 0.0, + "reward_change_mean": -0.31775298714637756, + "reward_change_min": -0.5628380477428436, + "reward_change_std": 0.21154501475393772, + "reward_std": 0.564231613650918, + "rewards/accuracy_reward": 0.22916667349636555, + "rewards/cosine_scaled_reward": -0.011819899722468108, + "step": 113 + }, + { + "clip_fraction": 0.0, + "completion_length": 2079.3541870117188, + "epoch": 0.13028571428571428, + "grad_norm": 0.023916104808449745, + "kl": 8.407607674598694e-05, + "lambda_div_used": 0.5991180911660194, + "learning_rate": 5.264488196906752e-07, + "loss": 0.0147, + "reward": -0.25019002705812454, + "reward_after_mean": -0.25019002705812454, + "reward_after_std": 0.48966687358915806, + "reward_before_mean": 0.01878603477962315, + "reward_before_std": 0.4579437389038503, + "reward_change_max": 0.0, + "reward_change_mean": -0.26897607184946537, + "reward_change_min": -0.4482330121099949, + "reward_change_std": 0.17346476390957832, + "reward_std": 0.4896668866276741, + "rewards/accuracy_reward": 0.16666666977107525, + "rewards/cosine_scaled_reward": -0.1478806473314762, + "step": 114 + }, + { + "clip_fraction": 0.0, + "completion_length": 2811.6041984558105, + "epoch": 0.13142857142857142, + "grad_norm": 0.02382122538983822, + "kl": 0.00011992454528808594, + "lambda_div_used": 0.6031630709767342, + "learning_rate": 5.186095868151436e-07, + "loss": 0.0298, + "reward": -0.024587277323007584, + "reward_after_mean": -0.024587277323007584, + "reward_after_std": 0.5156112629920244, + "reward_before_mean": 0.36486465483903885, + "reward_before_std": 0.47580901626497507, + "reward_change_max": 0.0, + "reward_change_mean": -0.38945191726088524, + "reward_change_min": -0.6165403127670288, + "reward_change_std": 0.24257665127515793, + "reward_std": 0.5156112778931856, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/cosine_scaled_reward": 0.01069796271622181, + "step": 115 + }, + { + "clip_fraction": 0.0, + "completion_length": 3174.166679382324, + "epoch": 0.13257142857142856, + "grad_norm": 0.0280291810631752, + "kl": 0.000163152813911438, + "lambda_div_used": 0.5893594026565552, + "learning_rate": 5.107799157635538e-07, + "loss": -0.0065, + "reward": -0.2329910285770893, + "reward_after_mean": -0.2329910285770893, + "reward_after_std": 0.42214493826031685, + "reward_before_mean": 0.07756753638386726, + "reward_before_std": 0.4099442269653082, + "reward_change_max": 0.0, + "reward_change_mean": -0.31055857613682747, + "reward_change_min": -0.5211558938026428, + "reward_change_std": 0.2011605817824602, + "reward_std": 0.42214495688676834, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/cosine_scaled_reward": -0.06826579943299294, + "step": 116 + }, + { + "clip_fraction": 0.0, + "completion_length": 2800.854232788086, + "epoch": 0.1337142857142857, + "grad_norm": 0.021884813904762268, + "kl": 0.00016610324382781982, + "lambda_div_used": 0.5740129947662354, + "learning_rate": 5.02962191529556e-07, + "loss": -0.0155, + "reward": -0.4151143445633352, + "reward_after_mean": -0.4151143445633352, + "reward_after_std": 0.41484479792416096, + "reward_before_mean": -0.17339750938117504, + "reward_before_std": 0.3358896663412452, + "reward_change_max": 0.0, + "reward_change_mean": -0.24171681702136993, + "reward_change_min": -0.35830606892704964, + "reward_change_std": 0.12972851190716028, + "reward_std": 0.41484480164945126, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/cosine_scaled_reward": -0.21506418148055673, + "step": 117 + }, + { + "clip_fraction": 0.0, + "completion_length": 2905.0000610351562, + "epoch": 0.13485714285714287, + "grad_norm": 0.019685380160808563, + "kl": 0.00012448430061340332, + "lambda_div_used": 0.6509419903159142, + "learning_rate": 4.951587954676837e-07, + "loss": 0.0699, + "reward": 0.37886764854192734, + "reward_after_mean": 0.37886764854192734, + "reward_after_std": 0.8599284738302231, + "reward_before_mean": 0.9192078877240419, + "reward_before_std": 0.7036966122686863, + "reward_change_max": 0.0, + "reward_change_mean": -0.54034024477005, + "reward_change_min": -0.8373602591454983, + "reward_change_std": 0.3177654892206192, + "reward_std": 0.8599285036325455, + "rewards/accuracy_reward": 0.5833333395421505, + "rewards/cosine_scaled_reward": 0.33587456680834293, + "step": 118 + }, + { + "clip_fraction": 0.0, + "completion_length": 1735.0416946411133, + "epoch": 0.136, + "grad_norm": 0.03752947598695755, + "kl": 9.926781058311462e-05, + "lambda_div_used": 0.615598551928997, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0198, + "reward": 0.11371836951002479, + "reward_after_mean": 0.11371836951002479, + "reward_after_std": 0.608397152274847, + "reward_before_mean": 0.5552113465964794, + "reward_before_std": 0.5343325138092041, + "reward_change_max": 0.0, + "reward_change_mean": -0.44149295426905155, + "reward_change_min": -0.6740178428590298, + "reward_change_std": 0.26589817740023136, + "reward_std": 0.6083971671760082, + "rewards/accuracy_reward": 0.37500000931322575, + "rewards/cosine_scaled_reward": 0.18021131958812475, + "step": 119 + }, + { + "clip_fraction": 0.0, + "completion_length": 2124.3958702087402, + "epoch": 0.13714285714285715, + "grad_norm": 0.03277261555194855, + "kl": 0.00016069412231445312, + "lambda_div_used": 0.621355876326561, + "learning_rate": 4.79604490731896e-07, + "loss": -0.0313, + "reward": 0.030127520207315683, + "reward_after_mean": 0.030127520207315683, + "reward_after_std": 0.6104090996086597, + "reward_before_mean": 0.3965782462619245, + "reward_before_std": 0.5573655245825648, + "reward_change_max": 0.0, + "reward_change_mean": -0.36645070649683475, + "reward_change_min": -0.5546461082994938, + "reward_change_std": 0.21761172730475664, + "reward_std": 0.6104091145098209, + "rewards/accuracy_reward": 0.3541666753590107, + "rewards/cosine_scaled_reward": 0.04241155949421227, + "step": 120 + }, + { + "clip_fraction": 0.0, + "completion_length": 1674.895866394043, + "epoch": 0.1382857142857143, + "grad_norm": 0.028996704146265984, + "kl": 0.00010481476783752441, + "lambda_div_used": 0.5694864094257355, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0117, + "reward": -0.15189427509903908, + "reward_after_mean": -0.15189427509903908, + "reward_after_std": 0.4316890323534608, + "reward_before_mean": 0.2471799086779356, + "reward_before_std": 0.3141373130492866, + "reward_change_max": 0.0, + "reward_change_mean": -0.3990741856396198, + "reward_change_min": -0.5547297224402428, + "reward_change_std": 0.21437653806060553, + "reward_std": 0.43168904818594456, + "rewards/accuracy_reward": 0.25000000558793545, + "rewards/cosine_scaled_reward": -0.002820094407070428, + "step": 121 + }, + { + "clip_fraction": 0.0, + "completion_length": 2674.1666870117188, + "epoch": 0.13942857142857143, + "grad_norm": 0.02413249760866165, + "kl": 0.00017333030700683594, + "lambda_div_used": 0.5866860523819923, + "learning_rate": 4.641359520805548e-07, + "loss": -0.0399, + "reward": 0.04396146908402443, + "reward_after_mean": 0.04396146908402443, + "reward_after_std": 0.5285588596016169, + "reward_before_mean": 0.5353102702647448, + "reward_before_std": 0.40454091038554907, + "reward_change_max": 0.0, + "reward_change_mean": -0.4913488235324621, + "reward_change_min": -0.7136706411838531, + "reward_change_std": 0.2844190578907728, + "reward_std": 0.5285588726401329, + "rewards/accuracy_reward": 0.3958333358168602, + "rewards/cosine_scaled_reward": 0.13947694562375546, + "step": 122 + }, + { + "clip_fraction": 0.0, + "completion_length": 2401.270881652832, + "epoch": 0.14057142857142857, + "grad_norm": 0.021569611504673958, + "kl": 0.00012353062629699707, + "lambda_div_used": 0.560750350356102, + "learning_rate": 4.5643973913200837e-07, + "loss": -0.0352, + "reward": -0.11011217907071114, + "reward_after_mean": -0.11011217907071114, + "reward_after_std": 0.43483646027743816, + "reward_before_mean": 0.3471109885722399, + "reward_before_std": 0.2799733504652977, + "reward_change_max": 0.0, + "reward_change_mean": -0.4572231862694025, + "reward_change_min": -0.6473490297794342, + "reward_change_std": 0.2491364972665906, + "reward_std": 0.4348364770412445, + "rewards/accuracy_reward": 0.3125, + "rewards/cosine_scaled_reward": 0.034610994160175323, + "step": 123 + }, + { + "clip_fraction": 0.0, + "completion_length": 2023.1875381469727, + "epoch": 0.1417142857142857, + "grad_norm": 0.024433298036456108, + "kl": 8.402764797210693e-05, + "lambda_div_used": 0.5741237699985504, + "learning_rate": 4.4877202554526084e-07, + "loss": 0.0177, + "reward": 0.11256878264248371, + "reward_after_mean": 0.11256878264248371, + "reward_after_std": 0.5534880999475718, + "reward_before_mean": 0.7005790947005153, + "reward_before_std": 0.34279677364975214, + "reward_change_max": 0.0, + "reward_change_mean": -0.5880102626979351, + "reward_change_min": -0.8437007814645767, + "reward_change_std": 0.32156766299158335, + "reward_std": 0.5534881185740232, + "rewards/accuracy_reward": 0.4583333358168602, + "rewards/cosine_scaled_reward": 0.2422457179054618, + "step": 124 + }, + { + "clip_fraction": 0.0, + "completion_length": 2390.12504196167, + "epoch": 0.14285714285714285, + "grad_norm": 0.027825474739074707, + "kl": 9.434670209884644e-05, + "lambda_div_used": 0.5816653594374657, + "learning_rate": 4.4113514698014953e-07, + "loss": -0.1441, + "reward": -0.022026576101779938, + "reward_after_mean": -0.022026576101779938, + "reward_after_std": 0.5128289703279734, + "reward_before_mean": 0.4427599459886551, + "reward_before_std": 0.37189827114343643, + "reward_change_max": 0.0, + "reward_change_mean": -0.4647865351289511, + "reward_change_min": -0.6719168424606323, + "reward_change_std": 0.2539686840027571, + "reward_std": 0.5128289721906185, + "rewards/accuracy_reward": 0.33333333395421505, + "rewards/cosine_scaled_reward": 0.10942662274464965, + "step": 125 + }, + { + "clip_fraction": 0.0, + "completion_length": 2348.625030517578, + "epoch": 0.144, + "grad_norm": 0.024593623355031013, + "kl": 0.0001001209020614624, + "lambda_div_used": 0.5944257900118828, + "learning_rate": 4.3353142970386557e-07, + "loss": 0.0235, + "reward": -0.036125872284173965, + "reward_after_mean": -0.036125872284173965, + "reward_after_std": 0.5458226818591356, + "reward_before_mean": 0.3889114623889327, + "reward_before_std": 0.43509659357368946, + "reward_change_max": 0.0, + "reward_change_mean": -0.4250373411923647, + "reward_change_min": -0.6385823003947735, + "reward_change_std": 0.24260270595550537, + "reward_std": 0.5458226818591356, + "rewards/accuracy_reward": 0.29166667349636555, + "rewards/cosine_scaled_reward": 0.09724476374685764, + "step": 126 + }, + { + "clip_fraction": 0.0, + "completion_length": 3143.229217529297, + "epoch": 0.14514285714285713, + "grad_norm": 0.021421613171696663, + "kl": 0.0001538693904876709, + "lambda_div_used": 0.5594507232308388, + "learning_rate": 4.2596318988235037e-07, + "loss": -0.0322, + "reward": -0.4113082066178322, + "reward_after_mean": -0.4113082066178322, + "reward_after_std": 0.34261589869856834, + "reward_before_mean": -0.14389780722558498, + "reward_before_std": 0.269620718434453, + "reward_change_max": 0.0, + "reward_change_mean": -0.2674104031175375, + "reward_change_min": -0.4196575991809368, + "reward_change_std": 0.15403888188302517, + "reward_std": 0.3426159042865038, + "rewards/accuracy_reward": 0.0625, + "rewards/cosine_scaled_reward": -0.20639780443161726, + "step": 127 + }, + { + "clip_fraction": 0.0, + "completion_length": 2329.5208740234375, + "epoch": 0.1462857142857143, + "grad_norm": 0.025876455008983612, + "kl": 0.00014778971672058105, + "lambda_div_used": 0.5925442725419998, + "learning_rate": 4.1843273287476854e-07, + "loss": -0.0218, + "reward": 0.04296835511922836, + "reward_after_mean": 0.04296835511922836, + "reward_after_std": 0.5716155916452408, + "reward_before_mean": 0.5175326284952462, + "reward_before_std": 0.4212250765413046, + "reward_change_max": 0.0, + "reward_change_mean": -0.4745642766356468, + "reward_change_min": -0.6585596464574337, + "reward_change_std": 0.25283501856029034, + "reward_std": 0.571615606546402, + "rewards/accuracy_reward": 0.3958333395421505, + "rewards/cosine_scaled_reward": 0.12169927265495062, + "step": 128 + }, + { + "clip_fraction": 0.0, + "completion_length": 3137.4583892822266, + "epoch": 0.14742857142857144, + "grad_norm": 0.018007410690188408, + "kl": 0.00014001131057739258, + "lambda_div_used": 0.6392767652869225, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0746, + "reward": -0.024870820343494415, + "reward_after_mean": -0.024870820343494415, + "reward_after_std": 0.6713957078754902, + "reward_before_mean": 0.2818590197712183, + "reward_before_std": 0.6434567552059889, + "reward_change_max": 0.0, + "reward_change_mean": -0.30672984197735786, + "reward_change_min": -0.5197948329150677, + "reward_change_std": 0.20186646562069654, + "reward_std": 0.6713957078754902, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/cosine_scaled_reward": 0.031859016977250576, + "step": 129 + }, + { + "clip_fraction": 0.0, + "completion_length": 3087.104202270508, + "epoch": 0.14857142857142858, + "grad_norm": 0.020643344148993492, + "kl": 0.00017881393432617188, + "lambda_div_used": 0.6044124737381935, + "learning_rate": 4.034943304942796e-07, + "loss": -0.0175, + "reward": -0.250907301902771, + "reward_after_mean": -0.250907301902771, + "reward_after_std": 0.514793710783124, + "reward_before_mean": 0.01573239639401436, + "reward_before_std": 0.4839063249528408, + "reward_change_max": 0.0, + "reward_change_mean": -0.26663970574736595, + "reward_change_min": -0.5019430406391621, + "reward_change_std": 0.1815296784043312, + "reward_std": 0.5147937145084143, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/cosine_scaled_reward": -0.1301009338349104, + "step": 130 + }, + { + "clip_fraction": 0.0, + "completion_length": 2290.5000381469727, + "epoch": 0.14971428571428572, + "grad_norm": 0.025495275855064392, + "kl": 0.00014261901378631592, + "lambda_div_used": 0.585864968597889, + "learning_rate": 3.9609093550344907e-07, + "loss": -0.0345, + "reward": -0.033165013417601585, + "reward_after_mean": -0.033165013417601585, + "reward_after_std": 0.5260650478303432, + "reward_before_mean": 0.42901195771992207, + "reward_before_std": 0.3989385652821511, + "reward_change_max": 0.0, + "reward_change_mean": -0.4621769953519106, + "reward_change_min": -0.7094772532582283, + "reward_change_std": 0.26999812573194504, + "reward_std": 0.5260650608688593, + "rewards/accuracy_reward": 0.3541666679084301, + "rewards/cosine_scaled_reward": 0.07484527863562107, + "step": 131 + }, + { + "clip_fraction": 0.0, + "completion_length": 2628.979202270508, + "epoch": 0.15085714285714286, + "grad_norm": 0.021410632878541946, + "kl": 0.00011786073446273804, + "lambda_div_used": 0.6330231353640556, + "learning_rate": 3.8873442270461485e-07, + "loss": -0.0163, + "reward": 0.21169579401612282, + "reward_after_mean": 0.21169579401612282, + "reward_after_std": 0.6052236501127481, + "reward_before_mean": 0.6431907135993242, + "reward_before_std": 0.611154742538929, + "reward_change_max": 0.0, + "reward_change_mean": -0.43149489536881447, + "reward_change_min": -0.7013396099209785, + "reward_change_std": 0.279757896438241, + "reward_std": 0.605223661288619, + "rewards/accuracy_reward": 0.4375000186264515, + "rewards/cosine_scaled_reward": 0.20569069124758244, + "step": 132 + }, + { + "clip_fraction": 0.0, + "completion_length": 3043.916702270508, + "epoch": 0.152, + "grad_norm": 0.020860377699136734, + "kl": 0.00016830861568450928, + "lambda_div_used": 0.577904686331749, + "learning_rate": 3.8142703296283953e-07, + "loss": -0.0229, + "reward": -0.311251699924469, + "reward_after_mean": -0.311251699924469, + "reward_after_std": 0.40735830925405025, + "reward_before_mean": -0.022546445950865746, + "reward_before_std": 0.35403214395046234, + "reward_change_max": 0.0, + "reward_change_mean": -0.28870525024831295, + "reward_change_min": -0.4795113056898117, + "reward_change_std": 0.175603779964149, + "reward_std": 0.40735832042992115, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/cosine_scaled_reward": -0.16837978060357273, + "step": 133 + }, + { + "clip_fraction": 0.0, + "completion_length": 2296.2500228881836, + "epoch": 0.15314285714285714, + "grad_norm": 0.025448989123106003, + "kl": 0.00012351572513580322, + "lambda_div_used": 0.6253047212958336, + "learning_rate": 3.7417099217982686e-07, + "loss": 0.0232, + "reward": 0.1032534665428102, + "reward_after_mean": 0.1032534665428102, + "reward_after_std": 0.6201032679527998, + "reward_before_mean": 0.527151208370924, + "reward_before_std": 0.5803719013929367, + "reward_change_max": 0.0, + "reward_change_mean": -0.4238977525383234, + "reward_change_min": -0.7053604945540428, + "reward_change_std": 0.27356533519923687, + "reward_std": 0.6201032791286707, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/cosine_scaled_reward": 0.1313178651034832, + "step": 134 + }, + { + "clip_fraction": 0.0, + "completion_length": 1375.6667098999023, + "epoch": 0.15428571428571428, + "grad_norm": 0.042060088366270065, + "kl": 7.014349102973938e-05, + "lambda_div_used": 0.6157513931393623, + "learning_rate": 3.6696851061588994e-07, + "loss": -0.0865, + "reward": 0.32518790662288666, + "reward_after_mean": 0.32518790662288666, + "reward_after_std": 0.7028943486511707, + "reward_before_mean": 0.9168294770643115, + "reward_before_std": 0.5377248618751764, + "reward_change_max": 0.0, + "reward_change_mean": -0.5916415732353926, + "reward_change_min": -0.8611635379493237, + "reward_change_std": 0.3413122948259115, + "reward_std": 0.7028943561017513, + "rewards/accuracy_reward": 0.6041666734963655, + "rewards/cosine_scaled_reward": 0.31266278121620417, + "step": 135 + }, + { + "clip_fraction": 0.0, + "completion_length": 2304.375030517578, + "epoch": 0.15542857142857142, + "grad_norm": 0.0248978603631258, + "kl": 0.00010488927364349365, + "lambda_div_used": 0.6134930327534676, + "learning_rate": 3.5982178221668533e-07, + "loss": -0.0298, + "reward": 0.14305459149181843, + "reward_after_mean": 0.14305459149181843, + "reward_after_std": 0.6303485874086618, + "reward_before_mean": 0.6280363164842129, + "reward_before_std": 0.5314226988703012, + "reward_change_max": 0.0, + "reward_change_mean": -0.4849817119538784, + "reward_change_min": -0.7693819738924503, + "reward_change_std": 0.29691250063478947, + "reward_std": 0.6303485967218876, + "rewards/accuracy_reward": 0.4375000037252903, + "rewards/cosine_scaled_reward": 0.19053628714755177, + "step": 136 + }, + { + "clip_fraction": 0.0, + "completion_length": 2849.312530517578, + "epoch": 0.15657142857142858, + "grad_norm": 0.019165532663464546, + "kl": 0.00011494755744934082, + "lambda_div_used": 0.6259770095348358, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0001, + "reward": -0.2035258673131466, + "reward_after_mean": -0.2035258673131466, + "reward_after_std": 0.6123348288238049, + "reward_before_mean": 0.05552574759349227, + "reward_before_std": 0.5897263735532761, + "reward_change_max": 0.0, + "reward_change_mean": -0.25905160419642925, + "reward_change_min": -0.49531829729676247, + "reward_change_std": 0.184982025064528, + "reward_std": 0.6123348399996758, + "rewards/accuracy_reward": 0.16666666977107525, + "rewards/cosine_scaled_reward": -0.11114091548370197, + "step": 137 + }, + { + "clip_fraction": 0.0, + "completion_length": 2511.0625381469727, + "epoch": 0.15771428571428572, + "grad_norm": 0.02206576056778431, + "kl": 9.766221046447754e-05, + "lambda_div_used": 0.5779839232563972, + "learning_rate": 3.45704275117204e-07, + "loss": -0.0123, + "reward": -0.23054278269410133, + "reward_after_mean": -0.23054278269410133, + "reward_after_std": 0.47196367010474205, + "reward_before_mean": 0.11714623775333166, + "reward_before_std": 0.35360280703753233, + "reward_change_max": 0.0, + "reward_change_mean": -0.34768899716436863, + "reward_change_min": -0.5065655931830406, + "reward_change_std": 0.18736570980399847, + "reward_std": 0.4719636719673872, + "rewards/accuracy_reward": 0.20833333395421505, + "rewards/cosine_scaled_reward": -0.09118711110204458, + "step": 138 + }, + { + "clip_fraction": 0.0, + "completion_length": 2817.166702270508, + "epoch": 0.15885714285714286, + "grad_norm": 0.025601239874958992, + "kl": 0.00013655424118041992, + "lambda_div_used": 0.6219307482242584, + "learning_rate": 3.387377967463493e-07, + "loss": -0.0354, + "reward": -0.07538405619561672, + "reward_after_mean": -0.07538405619561672, + "reward_after_std": 0.5752126723527908, + "reward_before_mean": 0.24527974613010883, + "reward_before_std": 0.5667949663475156, + "reward_change_max": 0.0, + "reward_change_mean": -0.3206638339906931, + "reward_change_min": -0.556602880358696, + "reward_change_std": 0.21935877669602633, + "reward_std": 0.5752126909792423, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/cosine_scaled_reward": -0.02555358811514452, + "step": 139 + }, + { + "clip_fraction": 0.0, + "completion_length": 3020.0833740234375, + "epoch": 0.16, + "grad_norm": 0.020280320197343826, + "kl": 0.00016957521438598633, + "lambda_div_used": 0.623006746172905, + "learning_rate": 3.3183567088914833e-07, + "loss": 0.0263, + "reward": 0.048921750858426094, + "reward_after_mean": 0.048921750858426094, + "reward_after_std": 0.6539230048656464, + "reward_before_mean": 0.46376091009005904, + "reward_before_std": 0.5685575436800718, + "reward_change_max": 0.0, + "reward_change_mean": -0.4148391764611006, + "reward_change_min": -0.6308448016643524, + "reward_change_std": 0.2505591865628958, + "reward_std": 0.6539230197668076, + "rewards/accuracy_reward": 0.3750000074505806, + "rewards/cosine_scaled_reward": 0.08876088261604309, + "step": 140 + }, + { + "clip_fraction": 0.0, + "completion_length": 2796.916732788086, + "epoch": 0.16114285714285714, + "grad_norm": 0.019921308383345604, + "kl": 0.00011113286018371582, + "lambda_div_used": 0.6025099903345108, + "learning_rate": 3.250000000000001e-07, + "loss": 0.0527, + "reward": -0.28436860628426075, + "reward_after_mean": -0.28436860628426075, + "reward_after_std": 0.5148810762912035, + "reward_before_mean": -0.02774716354906559, + "reward_before_std": 0.46978663094341755, + "reward_change_max": 0.0, + "reward_change_mean": -0.2566214445978403, + "reward_change_min": -0.41403992287814617, + "reward_change_std": 0.15458690002560616, + "reward_std": 0.5148810893297195, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/cosine_scaled_reward": -0.1527471598237753, + "step": 141 + }, + { + "clip_fraction": 0.0, + "completion_length": 2619.1458587646484, + "epoch": 0.16228571428571428, + "grad_norm": 0.019752761349081993, + "kl": 0.00013134628534317017, + "lambda_div_used": 0.595375120639801, + "learning_rate": 3.182328662904756e-07, + "loss": -0.0208, + "reward": -0.15498719364404678, + "reward_after_mean": -0.15498719364404678, + "reward_after_std": 0.5081784036010504, + "reward_before_mean": 0.1970929354429245, + "reward_before_std": 0.4446716960519552, + "reward_change_max": 0.0, + "reward_change_mean": -0.3520801328122616, + "reward_change_min": -0.6091671586036682, + "reward_change_std": 0.22200345993041992, + "reward_std": 0.508178411051631, + "rewards/accuracy_reward": 0.2291666679084301, + "rewards/cosine_scaled_reward": -0.032073733396828175, + "step": 142 + }, + { + "clip_fraction": 0.0, + "completion_length": 2416.2500762939453, + "epoch": 0.16342857142857142, + "grad_norm": 0.02633557841181755, + "kl": 0.00012940168380737305, + "lambda_div_used": 0.6070521473884583, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0473, + "reward": -0.1759424265474081, + "reward_after_mean": -0.1759424265474081, + "reward_after_std": 0.5199048612266779, + "reward_before_mean": 0.1274722833186388, + "reward_before_std": 0.4973313231021166, + "reward_change_max": 0.0, + "reward_change_mean": -0.3034147098660469, + "reward_change_min": -0.5472985841333866, + "reward_change_std": 0.20386416278779507, + "reward_std": 0.5199048724025488, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/cosine_scaled_reward": -0.03919439576566219, + "step": 143 + }, + { + "clip_fraction": 0.0, + "completion_length": 2497.0416946411133, + "epoch": 0.16457142857142856, + "grad_norm": 0.035163138061761856, + "kl": 0.00010834634304046631, + "lambda_div_used": 0.5917740762233734, + "learning_rate": 3.0491243424323783e-07, + "loss": 0.0908, + "reward": 0.24967988207936287, + "reward_after_mean": 0.24967988207936287, + "reward_after_std": 0.5445964094251394, + "reward_before_mean": 0.8210294228047132, + "reward_before_std": 0.4194592139683664, + "reward_change_max": 0.0, + "reward_change_mean": -0.5713495649397373, + "reward_change_min": -0.7989893518388271, + "reward_change_std": 0.3215014720335603, + "reward_std": 0.5445964206010103, + "rewards/accuracy_reward": 0.541666679084301, + "rewards/cosine_scaled_reward": 0.2793627381324768, + "step": 144 + }, + { + "clip_fraction": 0.0, + "completion_length": 1880.750015258789, + "epoch": 0.1657142857142857, + "grad_norm": 0.03040655143558979, + "kl": 9.316205978393555e-05, + "lambda_div_used": 0.6018117442727089, + "learning_rate": 2.9836319343816397e-07, + "loss": -0.0063, + "reward": 0.0018871724605560303, + "reward_after_mean": 0.0018871724605560303, + "reward_after_std": 0.6153606176376343, + "reward_before_mean": 0.45608025789260864, + "reward_before_std": 0.47093176282942295, + "reward_change_max": 0.0, + "reward_change_mean": -0.45419312454760075, + "reward_change_min": -0.7000111788511276, + "reward_change_std": 0.26173546724021435, + "reward_std": 0.6153606250882149, + "rewards/accuracy_reward": 0.35416666977107525, + "rewards/cosine_scaled_reward": 0.10191359603777528, + "step": 145 + }, + { + "clip_fraction": 0.0, + "completion_length": 1995.1041793823242, + "epoch": 0.16685714285714287, + "grad_norm": 0.02280835248529911, + "kl": 9.866058826446533e-05, + "lambda_div_used": 0.5822854116559029, + "learning_rate": 2.918906036420294e-07, + "loss": 0.0731, + "reward": -0.3418470360338688, + "reward_after_mean": -0.3418470360338688, + "reward_after_std": 0.4274127297103405, + "reward_before_mean": -0.07948943041265011, + "reward_before_std": 0.38051687460392714, + "reward_change_max": 0.0, + "reward_change_mean": -0.2623576056212187, + "reward_change_min": -0.453898411244154, + "reward_change_std": 0.1681989086791873, + "reward_std": 0.4274127408862114, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/cosine_scaled_reward": -0.18365611135959625, + "step": 146 + }, + { + "clip_fraction": 0.0, + "completion_length": 3414.4166870117188, + "epoch": 0.168, + "grad_norm": 0.017901504412293434, + "kl": 0.00017184019088745117, + "lambda_div_used": 0.6236077323555946, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0225, + "reward": -0.0833415687084198, + "reward_after_mean": -0.0833415687084198, + "reward_after_std": 0.5858507957309484, + "reward_before_mean": 0.216837452724576, + "reward_before_std": 0.5764442849904299, + "reward_change_max": 0.0, + "reward_change_mean": -0.30017900839447975, + "reward_change_min": -0.5037183798849583, + "reward_change_std": 0.2055044947192073, + "reward_std": 0.5858508311212063, + "rewards/accuracy_reward": 0.22916666977107525, + "rewards/cosine_scaled_reward": -0.012329218676313758, + "step": 147 + }, + { + "clip_fraction": 0.0, + "completion_length": 2291.000030517578, + "epoch": 0.16914285714285715, + "grad_norm": 0.021171187981963158, + "kl": 0.00012093409895896912, + "lambda_div_used": 0.5844326093792915, + "learning_rate": 2.791832395815782e-07, + "loss": 0.0386, + "reward": -0.15509312599897385, + "reward_after_mean": -0.15509312599897385, + "reward_after_std": 0.48159872740507126, + "reward_before_mean": 0.22943633235991, + "reward_before_std": 0.39098774176090956, + "reward_change_max": 0.0, + "reward_change_mean": -0.3845294751226902, + "reward_change_min": -0.6122906021773815, + "reward_change_std": 0.23033427819609642, + "reward_std": 0.48159876093268394, + "rewards/accuracy_reward": 0.2291666679084301, + "rewards/cosine_scaled_reward": 0.00026967376470565796, + "step": 148 + }, + { + "clip_fraction": 0.0, + "completion_length": 2509.0416870117188, + "epoch": 0.1702857142857143, + "grad_norm": 0.020010385662317276, + "kl": 8.110702037811279e-05, + "lambda_div_used": 0.5822181403636932, + "learning_rate": 2.729523361034538e-07, + "loss": 0.0208, + "reward": 0.08701876550912857, + "reward_after_mean": 0.08701876550912857, + "reward_after_std": 0.5091588757932186, + "reward_before_mean": 0.598341865465045, + "reward_before_std": 0.3779993327334523, + "reward_change_max": 0.0, + "reward_change_mean": -0.5113231185823679, + "reward_change_min": -0.740135669708252, + "reward_change_std": 0.2875883989036083, + "reward_std": 0.5091589000076056, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/cosine_scaled_reward": 0.18167520873248577, + "step": 149 + }, + { + "clip_fraction": 0.0, + "completion_length": 2679.208381652832, + "epoch": 0.17142857142857143, + "grad_norm": 0.025659440085291862, + "kl": 0.00014823675155639648, + "lambda_div_used": 0.6290425136685371, + "learning_rate": 2.6680582402757324e-07, + "loss": -0.0233, + "reward": -0.049642632249742746, + "reward_after_mean": -0.049642632249742746, + "reward_after_std": 0.5986230112612247, + "reward_before_mean": 0.2686304301023483, + "reward_before_std": 0.6011289358139038, + "reward_change_max": 0.0, + "reward_change_mean": -0.31827306374907494, + "reward_change_min": -0.5840066187083721, + "reward_change_std": 0.2246640883386135, + "reward_std": 0.598623014986515, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/cosine_scaled_reward": -0.002202920615673065, + "step": 150 + }, + { + "clip_fraction": 0.0, + "completion_length": 2462.5625610351562, + "epoch": 0.17257142857142857, + "grad_norm": 0.03043326921761036, + "kl": 0.0001488029956817627, + "lambda_div_used": 0.6573176011443138, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0929, + "reward": 0.2124087940901518, + "reward_after_mean": 0.2124087940901518, + "reward_after_std": 0.762018321081996, + "reward_before_mean": 0.6105309925042093, + "reward_before_std": 0.7428378090262413, + "reward_change_max": 0.0, + "reward_change_mean": -0.39812218956649303, + "reward_change_min": -0.6643032841384411, + "reward_change_std": 0.271884405054152, + "reward_std": 0.7620183527469635, + "rewards/accuracy_reward": 0.43750000558793545, + "rewards/cosine_scaled_reward": 0.17303097806870937, + "step": 151 + }, + { + "clip_fraction": 0.0, + "completion_length": 2846.875072479248, + "epoch": 0.1737142857142857, + "grad_norm": 0.02812943048775196, + "kl": 0.00018167495727539062, + "lambda_div_used": 0.599166102707386, + "learning_rate": 2.547734369542718e-07, + "loss": -0.0069, + "reward": -0.32120730075985193, + "reward_after_mean": -0.32120730075985193, + "reward_after_std": 0.5156350377947092, + "reward_before_mean": -0.06628246325999498, + "reward_before_std": 0.4535220582038164, + "reward_change_max": 0.0, + "reward_change_mean": -0.25492484122514725, + "reward_change_min": -0.42022984474897385, + "reward_change_std": 0.14956693351268768, + "reward_std": 0.5156350489705801, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/cosine_scaled_reward": -0.1496157981455326, + "step": 152 + }, + { + "clip_fraction": 0.0, + "completion_length": 2625.770896911621, + "epoch": 0.17485714285714285, + "grad_norm": 0.02562631107866764, + "kl": 0.0001496821641921997, + "lambda_div_used": 0.5755600407719612, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0383, + "reward": -0.2726967688649893, + "reward_after_mean": -0.2726967688649893, + "reward_after_std": 0.43943885155022144, + "reward_before_mean": 0.054511758498847485, + "reward_before_std": 0.3466099677607417, + "reward_change_max": 0.0, + "reward_change_mean": -0.3272085413336754, + "reward_change_min": -0.4877549596130848, + "reward_change_std": 0.18566382955759764, + "reward_std": 0.4394388683140278, + "rewards/accuracy_reward": 0.20833333395421505, + "rewards/cosine_scaled_reward": -0.15382158383727074, + "step": 153 + }, + { + "clip_fraction": 0.0, + "completion_length": 2930.2083587646484, + "epoch": 0.176, + "grad_norm": 0.01869855262339115, + "kl": 0.00013381242752075195, + "lambda_div_used": 0.6487007588148117, + "learning_rate": 2.4310073797187573e-07, + "loss": 0.0074, + "reward": 0.17745468392968178, + "reward_after_mean": 0.17745468392968178, + "reward_after_std": 0.6683135256171227, + "reward_before_mean": 0.554833997040987, + "reward_before_std": 0.6959163639694452, + "reward_change_max": 0.0, + "reward_change_mean": -0.37737933173775673, + "reward_change_min": -0.6365409940481186, + "reward_change_std": 0.26569664292037487, + "reward_std": 0.6683135367929935, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/cosine_scaled_reward": 0.15900065936148167, + "step": 154 + }, + { + "clip_fraction": 0.0, + "completion_length": 2324.7083587646484, + "epoch": 0.17714285714285713, + "grad_norm": 0.03042282536625862, + "kl": 0.0001443326473236084, + "lambda_div_used": 0.6021355092525482, + "learning_rate": 2.374037332934512e-07, + "loss": -0.0708, + "reward": 0.013361499644815922, + "reward_after_mean": 0.013361499644815922, + "reward_after_std": 0.608528571203351, + "reward_before_mean": 0.4701185021549463, + "reward_before_std": 0.4722642693668604, + "reward_change_max": 0.0, + "reward_change_mean": -0.4567570425570011, + "reward_change_min": -0.728898536413908, + "reward_change_std": 0.2705167792737484, + "reward_std": 0.6085285805165768, + "rewards/accuracy_reward": 0.3958333358168602, + "rewards/cosine_scaled_reward": 0.07428519520908594, + "step": 155 + }, + { + "clip_fraction": 0.0, + "completion_length": 2742.041702270508, + "epoch": 0.1782857142857143, + "grad_norm": 0.023997837677598, + "kl": 0.00013341009616851807, + "lambda_div_used": 0.6111876517534256, + "learning_rate": 2.3180194846605364e-07, + "loss": 0.0104, + "reward": -0.11058625392615795, + "reward_after_mean": -0.11058625392615795, + "reward_after_std": 0.5431522708386183, + "reward_before_mean": 0.20433677232358605, + "reward_before_std": 0.5068034324795008, + "reward_change_max": 0.0, + "reward_change_mean": -0.31492303870618343, + "reward_change_min": -0.4863894209265709, + "reward_change_std": 0.18972175009548664, + "reward_std": 0.5431522782891989, + "rewards/accuracy_reward": 0.2291666753590107, + "rewards/cosine_scaled_reward": -0.024829893372952938, + "step": 156 + }, + { + "clip_fraction": 0.0, + "completion_length": 2665.875045776367, + "epoch": 0.17942857142857144, + "grad_norm": 0.02073371410369873, + "kl": 0.00014007091522216797, + "lambda_div_used": 0.5806600153446198, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.0406, + "reward": -0.2642064723186195, + "reward_after_mean": -0.2642064723186195, + "reward_after_std": 0.48669449612498283, + "reward_before_mean": 0.06258813291788101, + "reward_before_std": 0.3662749119102955, + "reward_change_max": 0.0, + "reward_change_mean": -0.32679460756480694, + "reward_change_min": -0.473216038197279, + "reward_change_std": 0.1716562630608678, + "reward_std": 0.48669449612498283, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.10407854370714631, + "step": 157 + }, + { + "clip_fraction": 0.0, + "completion_length": 2338.8333740234375, + "epoch": 0.18057142857142858, + "grad_norm": 0.02651275135576725, + "kl": 0.00011517666280269623, + "lambda_div_used": 0.6631434112787247, + "learning_rate": 2.2089083427137329e-07, + "loss": 0.0501, + "reward": 0.14301904384046793, + "reward_after_mean": 0.14301904384046793, + "reward_after_std": 0.8312356304377317, + "reward_before_mean": 0.5320943212136626, + "reward_before_std": 0.7637526150792837, + "reward_change_max": 0.0, + "reward_change_mean": -0.38907529041171074, + "reward_change_min": -0.694700576364994, + "reward_change_std": 0.26256909035146236, + "reward_std": 0.8312356378883123, + "rewards/accuracy_reward": 0.3750000074505806, + "rewards/cosine_scaled_reward": 0.15709431585855782, + "step": 158 + }, + { + "clip_fraction": 0.0, + "completion_length": 3239.312530517578, + "epoch": 0.18171428571428572, + "grad_norm": 0.016524603590369225, + "kl": 0.0001583099365234375, + "lambda_div_used": 0.623667947947979, + "learning_rate": 2.1558482853517253e-07, + "loss": -0.0341, + "reward": -0.1187703013420105, + "reward_after_mean": -0.1187703013420105, + "reward_after_std": 0.5951940100640059, + "reward_before_mean": 0.16929386125411838, + "reward_before_std": 0.5728897508233786, + "reward_change_max": 0.0, + "reward_change_mean": -0.28806419111788273, + "reward_change_min": -0.4665379598736763, + "reward_change_std": 0.1831390606239438, + "reward_std": 0.5951940137892962, + "rewards/accuracy_reward": 0.2291666753590107, + "rewards/cosine_scaled_reward": -0.05987279675900936, + "step": 159 + }, + { + "clip_fraction": 0.0, + "completion_length": 2803.312530517578, + "epoch": 0.18285714285714286, + "grad_norm": 0.026071852073073387, + "kl": 0.00017073750495910645, + "lambda_div_used": 0.6305168867111206, + "learning_rate": 2.1038068889975259e-07, + "loss": -0.0459, + "reward": 0.010341526940464973, + "reward_after_mean": 0.010341526940464973, + "reward_after_std": 0.6011195741593838, + "reward_before_mean": 0.3497283663600683, + "reward_before_std": 0.6088744457811117, + "reward_change_max": 0.0, + "reward_change_mean": -0.33938686549663544, + "reward_change_min": -0.5917558334767818, + "reward_change_std": 0.2366197258234024, + "reward_std": 0.6011195983737707, + "rewards/accuracy_reward": 0.2916666753590107, + "rewards/cosine_scaled_reward": 0.05806170590221882, + "step": 160 + }, + { + "clip_fraction": 0.0, + "completion_length": 2208.9167098999023, + "epoch": 0.184, + "grad_norm": 0.0240344051271677, + "kl": 0.000129062682390213, + "lambda_div_used": 0.6582028865814209, + "learning_rate": 2.0528000059645995e-07, + "loss": 0.0279, + "reward": 0.07936648279428482, + "reward_after_mean": 0.07936648279428482, + "reward_after_std": 0.7546874471008778, + "reward_before_mean": 0.40869135939283296, + "reward_before_std": 0.7339576873928308, + "reward_change_max": 0.0, + "reward_change_mean": -0.32932490296661854, + "reward_change_min": -0.6056464668363333, + "reward_change_std": 0.22207134775817394, + "reward_std": 0.7546874955296516, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/cosine_scaled_reward": 0.05452469550073147, + "step": 161 + }, + { + "clip_fraction": 0.0, + "completion_length": 3135.0208892822266, + "epoch": 0.18514285714285714, + "grad_norm": 0.022219210863113403, + "kl": 0.0001678466796875, + "lambda_div_used": 0.5934342220425606, + "learning_rate": 2.0028431734436308e-07, + "loss": -0.0607, + "reward": -0.046884071081876755, + "reward_after_mean": -0.046884071081876755, + "reward_after_std": 0.5110116824507713, + "reward_before_mean": 0.3527396023273468, + "reward_before_std": 0.4275300269946456, + "reward_change_max": 0.0, + "reward_change_mean": -0.39962366595864296, + "reward_change_min": -0.5722533725202084, + "reward_change_std": 0.22872111946344376, + "reward_std": 0.5110117141157389, + "rewards/accuracy_reward": 0.33333334140479565, + "rewards/cosine_scaled_reward": 0.019406253471970558, + "step": 162 + }, + { + "clip_fraction": 0.0, + "completion_length": 2319.9791870117188, + "epoch": 0.18628571428571428, + "grad_norm": 0.025694716721773148, + "kl": 0.00014371052384376526, + "lambda_div_used": 0.5704625844955444, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.0916, + "reward": 0.07464592158794403, + "reward_after_mean": 0.07464592158794403, + "reward_after_std": 0.49546825513243675, + "reward_before_mean": 0.6325086355209351, + "reward_before_std": 0.32235115580260754, + "reward_change_max": 0.0, + "reward_change_mean": -0.557862676680088, + "reward_change_min": -0.7940906882286072, + "reward_change_std": 0.3045828063040972, + "reward_std": 0.49546825885772705, + "rewards/accuracy_reward": 0.4375, + "rewards/cosine_scaled_reward": 0.19500861689448357, + "step": 163 + }, + { + "clip_fraction": 0.0, + "completion_length": 2167.750045776367, + "epoch": 0.18742857142857142, + "grad_norm": 0.0288226380944252, + "kl": 0.0001278519630432129, + "lambda_div_used": 0.6208588480949402, + "learning_rate": 1.9061402047871833e-07, + "loss": 0.0045, + "reward": 0.052167763307807036, + "reward_after_mean": 0.052167763307807036, + "reward_after_std": 0.627906009554863, + "reward_before_mean": 0.44521861523389816, + "reward_before_std": 0.5596300046890974, + "reward_change_max": 0.0, + "reward_change_mean": -0.39305083081126213, + "reward_change_min": -0.6213876642286777, + "reward_change_std": 0.24184285942465067, + "reward_std": 0.6279060393571854, + "rewards/accuracy_reward": 0.35416667349636555, + "rewards/cosine_scaled_reward": 0.09105192590504885, + "step": 164 + }, + { + "clip_fraction": 0.0, + "completion_length": 2823.8125228881836, + "epoch": 0.18857142857142858, + "grad_norm": 0.026144707575440407, + "kl": 0.00016139447689056396, + "lambda_div_used": 0.5820390656590462, + "learning_rate": 1.8594235253127372e-07, + "loss": -0.0708, + "reward": -0.32857649284414947, + "reward_after_mean": -0.32857649284414947, + "reward_after_std": 0.4344688355922699, + "reward_before_mean": -0.0503513365983963, + "reward_before_std": 0.3760820124298334, + "reward_change_max": 0.0, + "reward_change_mean": -0.2782251574099064, + "reward_change_min": -0.43433111906051636, + "reward_change_std": 0.16556962952017784, + "reward_std": 0.4344688393175602, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/cosine_scaled_reward": -0.13368466403335333, + "step": 165 + }, + { + "clip_fraction": 0.0, + "completion_length": 2491.0208587646484, + "epoch": 0.18971428571428572, + "grad_norm": 0.019254038110375404, + "kl": 0.0001147836446762085, + "lambda_div_used": 0.6493343263864517, + "learning_rate": 1.8138158006995363e-07, + "loss": 0.0025, + "reward": 0.10031834430992603, + "reward_after_mean": 0.10031834430992603, + "reward_after_std": 0.6912827659398317, + "reward_before_mean": 0.4514310024678707, + "reward_before_std": 0.6998845022171736, + "reward_change_max": 0.0, + "reward_change_mean": -0.351112674921751, + "reward_change_min": -0.6428324580192566, + "reward_change_std": 0.2517909351736307, + "reward_std": 0.6912827901542187, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/cosine_scaled_reward": 0.11809766665101051, + "step": 166 + }, + { + "clip_fraction": 0.0, + "completion_length": 2146.645896911621, + "epoch": 0.19085714285714286, + "grad_norm": 0.028959238901734352, + "kl": 0.00012630224227905273, + "lambda_div_used": 0.602773554623127, + "learning_rate": 1.7693309235023127e-07, + "loss": -0.0687, + "reward": -0.08888023532927036, + "reward_after_mean": -0.08888023532927036, + "reward_after_std": 0.555552402511239, + "reward_before_mean": 0.284214471001178, + "reward_before_std": 0.4734340328723192, + "reward_change_max": 0.0, + "reward_change_mean": -0.37309471145272255, + "reward_change_min": -0.5994942858815193, + "reward_change_std": 0.2264004945755005, + "reward_std": 0.5555524323135614, + "rewards/accuracy_reward": 0.27083333767950535, + "rewards/cosine_scaled_reward": 0.013381102122366428, + "step": 167 + }, + { + "clip_fraction": 0.0, + "completion_length": 2591.562545776367, + "epoch": 0.192, + "grad_norm": 0.022079093381762505, + "kl": 0.00013020634651184082, + "lambda_div_used": 0.6182359680533409, + "learning_rate": 1.7259824442455923e-07, + "loss": 0.0571, + "reward": 0.07187426090240479, + "reward_after_mean": 0.07187426090240479, + "reward_after_std": 0.6350691560655832, + "reward_before_mean": 0.4906120039522648, + "reward_before_std": 0.5473381988704205, + "reward_change_max": 0.0, + "reward_change_mean": -0.4187377579510212, + "reward_change_min": -0.6163501553237438, + "reward_change_std": 0.24177053570747375, + "reward_std": 0.6350691728293896, + "rewards/accuracy_reward": 0.3541666753590107, + "rewards/cosine_scaled_reward": 0.1364453360438347, + "step": 168 + }, + { + "clip_fraction": 0.0, + "completion_length": 1961.333366394043, + "epoch": 0.19314285714285714, + "grad_norm": 0.024920228868722916, + "kl": 0.00011189281940460205, + "lambda_div_used": 0.5712982937693596, + "learning_rate": 1.6837835672960831e-07, + "loss": -0.037, + "reward": 0.20125045720487833, + "reward_after_mean": 0.20125045720487833, + "reward_after_std": 0.5916534103453159, + "reward_before_mean": 0.8435525028035045, + "reward_before_std": 0.32216374203562737, + "reward_change_max": 0.0, + "reward_change_mean": -0.6423020549118519, + "reward_change_min": -0.836066972464323, + "reward_change_std": 0.32226173765957355, + "reward_std": 0.5916534326970577, + "rewards/accuracy_reward": 0.5416666679084301, + "rewards/cosine_scaled_reward": 0.3018858137074858, + "step": 169 + }, + { + "clip_fraction": 0.0, + "completion_length": 2154.5417404174805, + "epoch": 0.19428571428571428, + "grad_norm": 0.028055250644683838, + "kl": 9.50545072555542e-05, + "lambda_div_used": 0.5548974648118019, + "learning_rate": 1.6427471468404952e-07, + "loss": 0.0414, + "reward": -0.1357494406402111, + "reward_after_mean": -0.1357494406402111, + "reward_after_std": 0.41400698386132717, + "reward_before_mean": 0.32883553951978683, + "reward_before_std": 0.2495311009697616, + "reward_change_max": 0.0, + "reward_change_mean": -0.4645849745720625, + "reward_change_min": -0.6499762162566185, + "reward_change_std": 0.250681190751493, + "reward_std": 0.41400699876248837, + "rewards/accuracy_reward": 0.3125, + "rewards/cosine_scaled_reward": 0.01633552461862564, + "step": 170 + }, + { + "clip_fraction": 0.0, + "completion_length": 2344.3958587646484, + "epoch": 0.19542857142857142, + "grad_norm": 0.02545234002172947, + "kl": 0.00012035667896270752, + "lambda_div_used": 0.5686006918549538, + "learning_rate": 1.6028856829700258e-07, + "loss": -0.0076, + "reward": 0.025929288007318974, + "reward_after_mean": 0.025929288007318974, + "reward_after_std": 0.4821996595710516, + "reward_before_mean": 0.5422459719702601, + "reward_before_std": 0.3103277189657092, + "reward_change_max": 0.0, + "reward_change_mean": -0.5163167044520378, + "reward_change_min": -0.6874858625233173, + "reward_change_std": 0.26997776329517365, + "reward_std": 0.4821996670216322, + "rewards/accuracy_reward": 0.37500000558793545, + "rewards/cosine_scaled_reward": 0.1672459738329053, + "step": 171 + }, + { + "clip_fraction": 0.0, + "completion_length": 2580.3958740234375, + "epoch": 0.19657142857142856, + "grad_norm": 0.03461524099111557, + "kl": 0.00015616416931152344, + "lambda_div_used": 0.5815886929631233, + "learning_rate": 8.487667956935087e-07, + "loss": -0.0386, + "reward": -0.02652345411479473, + "reward_after_mean": -0.02652345411479473, + "reward_after_std": 0.5259725619107485, + "reward_before_mean": 0.4387537483125925, + "reward_before_std": 0.3750908151268959, + "reward_change_max": 0.0, + "reward_change_mean": -0.46527721732854843, + "reward_change_min": -0.6695724055171013, + "reward_change_std": 0.2563530644401908, + "reward_std": 0.5259725674986839, + "rewards/accuracy_reward": 0.33333333395421505, + "rewards/cosine_scaled_reward": 0.10542040364816785, + "step": 172 + }, + { + "clip_fraction": 0.0, + "completion_length": 1932.0625305175781, + "epoch": 0.1977142857142857, + "grad_norm": 0.03550613671541214, + "kl": 9.726732969284058e-05, + "lambda_div_used": 0.5629367232322693, + "learning_rate": 8.464102570534061e-07, + "loss": -0.0198, + "reward": -0.33315238857176155, + "reward_after_mean": -0.33315238857176155, + "reward_after_std": 0.35940456483513117, + "reward_before_mean": -0.03457173053175211, + "reward_before_std": 0.2833498573163524, + "reward_change_max": 0.0, + "reward_change_mean": -0.29858064092695713, + "reward_change_min": -0.4200097434222698, + "reward_change_std": 0.16404641512781382, + "reward_std": 0.35940458066761494, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/cosine_scaled_reward": -0.15957174729555845, + "step": 173 + }, + { + "clip_fraction": 0.0, + "completion_length": 1809.7708587646484, + "epoch": 0.19885714285714284, + "grad_norm": 0.03420722112059593, + "kl": 0.00013248249888420105, + "lambda_div_used": 0.6139687895774841, + "learning_rate": 8.440392717955475e-07, + "loss": -0.0816, + "reward": -0.07395929284393787, + "reward_after_mean": -0.07395929284393787, + "reward_after_std": 0.5732789468020201, + "reward_before_mean": 0.25243946351110935, + "reward_before_std": 0.5262010591104627, + "reward_change_max": 0.0, + "reward_change_mean": -0.3263987563550472, + "reward_change_min": -0.5132386535406113, + "reward_change_std": 0.20316704735159874, + "reward_std": 0.5732789561152458, + "rewards/accuracy_reward": 0.22916667349636555, + "rewards/cosine_scaled_reward": 0.02327278454322368, + "step": 174 + }, + { + "clip_fraction": 0.0, + "completion_length": 2270.7500495910645, + "epoch": 0.2, + "grad_norm": 0.023624489083886147, + "kl": 0.00011564046144485474, + "lambda_div_used": 0.570957601070404, + "learning_rate": 8.416539554784089e-07, + "loss": 0.0018, + "reward": -0.09137872606515884, + "reward_after_mean": -0.09137872606515884, + "reward_after_std": 0.4245176389813423, + "reward_before_mean": 0.3394407369196415, + "reward_before_std": 0.3214457123540342, + "reward_change_max": 0.0, + "reward_change_mean": -0.4308194350451231, + "reward_change_min": -0.6172507330775261, + "reward_change_std": 0.24237936083227396, + "reward_std": 0.4245176613330841, + "rewards/accuracy_reward": 0.3125000074505806, + "rewards/cosine_scaled_reward": 0.026940705254673958, + "step": 175 + }, + { + "clip_fraction": 0.0, + "completion_length": 2428.3750534057617, + "epoch": 0.20114285714285715, + "grad_norm": 0.03552456945180893, + "kl": 9.998679161071777e-05, + "lambda_div_used": 0.6270494386553764, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0567, + "reward": 0.23782964330166578, + "reward_after_mean": 0.23782964330166578, + "reward_after_std": 0.6844992376863956, + "reward_before_mean": 0.7466199137270451, + "reward_before_std": 0.594361359719187, + "reward_change_max": 0.0, + "reward_change_mean": -0.5087902657687664, + "reward_change_min": -0.8128968887031078, + "reward_change_std": 0.31682352535426617, + "reward_std": 0.6844992693513632, + "rewards/accuracy_reward": 0.5000000074505806, + "rewards/cosine_scaled_reward": 0.24661988578736782, + "step": 176 + }, + { + "clip_fraction": 0.0, + "completion_length": 2709.5833740234375, + "epoch": 0.2022857142857143, + "grad_norm": 0.022738253697752953, + "kl": 0.0001655668020248413, + "lambda_div_used": 0.6092279329895973, + "learning_rate": 8.368407953869103e-07, + "loss": 0.0278, + "reward": -0.16157673671841621, + "reward_after_mean": -0.16157673671841621, + "reward_after_std": 0.5301279928535223, + "reward_before_mean": 0.14143561571836472, + "reward_before_std": 0.5103836972266436, + "reward_change_max": 0.0, + "reward_change_mean": -0.303012328222394, + "reward_change_min": -0.5294999107718468, + "reward_change_std": 0.2066562958061695, + "reward_std": 0.530128002166748, + "rewards/accuracy_reward": 0.1875, + "rewards/cosine_scaled_reward": -0.04606438986957073, + "step": 177 + }, + { + "clip_fraction": 0.0, + "completion_length": 2521.562545776367, + "epoch": 0.20342857142857143, + "grad_norm": 0.023333929479122162, + "kl": 0.00010882318019866943, + "lambda_div_used": 0.5983341336250305, + "learning_rate": 8.344131861991828e-07, + "loss": 0.0305, + "reward": -0.0399339459836483, + "reward_after_mean": -0.0399339459836483, + "reward_after_std": 0.544251000508666, + "reward_before_mean": 0.369808804243803, + "reward_before_std": 0.4518149495124817, + "reward_change_max": 0.0, + "reward_change_mean": -0.4097427297383547, + "reward_change_min": -0.6259582564234734, + "reward_change_std": 0.23724547680467367, + "reward_std": 0.5442510098218918, + "rewards/accuracy_reward": 0.29166667349636555, + "rewards/cosine_scaled_reward": 0.07814211072400212, + "step": 178 + }, + { + "clip_fraction": 0.0, + "completion_length": 2703.7083740234375, + "epoch": 0.20457142857142857, + "grad_norm": 0.023093275725841522, + "kl": 0.00013177096843719482, + "lambda_div_used": 0.5762727931141853, + "learning_rate": 8.319717151140072e-07, + "loss": 0.0879, + "reward": -0.20519665256142616, + "reward_after_mean": -0.20519665256142616, + "reward_after_std": 0.39140512235462666, + "reward_before_mean": 0.13847777154296637, + "reward_before_std": 0.3475749148055911, + "reward_change_max": 0.0, + "reward_change_mean": -0.3436744213104248, + "reward_change_min": -0.5246328189969063, + "reward_change_std": 0.20370345003902912, + "reward_std": 0.39140513353049755, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/cosine_scaled_reward": -0.049022228457033634, + "step": 179 + }, + { + "clip_fraction": 0.0, + "completion_length": 2075.3750228881836, + "epoch": 0.2057142857142857, + "grad_norm": 0.03283306583762169, + "kl": 0.00013710558414459229, + "lambda_div_used": 0.5492931827902794, + "learning_rate": 8.295165011252396e-07, + "loss": -0.0592, + "reward": -0.09344155341386795, + "reward_after_mean": -0.09344155341386795, + "reward_after_std": 0.37022680789232254, + "reward_before_mean": 0.39771560952067375, + "reward_before_std": 0.21920094243250787, + "reward_change_max": 0.0, + "reward_change_mean": -0.4911571964621544, + "reward_change_min": -0.6641882658004761, + "reward_change_std": 0.25923535134643316, + "reward_std": 0.3702268172055483, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/cosine_scaled_reward": 0.0435489546507597, + "step": 180 + }, + { + "clip_fraction": 0.0, + "completion_length": 2976.875030517578, + "epoch": 0.20685714285714285, + "grad_norm": 0.02316705696284771, + "kl": 0.00017173588275909424, + "lambda_div_used": 0.594053827226162, + "learning_rate": 8.270476638965461e-07, + "loss": -0.0029, + "reward": -0.17979225050657988, + "reward_after_mean": -0.17979225050657988, + "reward_after_std": 0.5341140031814575, + "reward_before_mean": 0.17803902877494693, + "reward_before_std": 0.4284058129414916, + "reward_change_max": 0.0, + "reward_change_mean": -0.3578312788158655, + "reward_change_min": -0.5442017950117588, + "reward_change_std": 0.2014410514384508, + "reward_std": 0.5341140106320381, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/cosine_scaled_reward": -0.03029430890455842, + "step": 181 + }, + { + "clip_fraction": 0.0, + "completion_length": 1833.0833930969238, + "epoch": 0.208, + "grad_norm": 0.02471437305212021, + "kl": 6.746500730514526e-05, + "lambda_div_used": 0.6185052543878555, + "learning_rate": 8.245653237555705e-07, + "loss": -0.0541, + "reward": 0.10658288560807705, + "reward_after_mean": 0.10658288560807705, + "reward_after_std": 0.6189861167222261, + "reward_before_mean": 0.5465792864561081, + "reward_before_std": 0.5544852227903903, + "reward_change_max": 0.0, + "reward_change_mean": -0.43999641202390194, + "reward_change_min": -0.7109990864992142, + "reward_change_std": 0.2778801778331399, + "reward_std": 0.618986152112484, + "rewards/accuracy_reward": 0.4375000037252903, + "rewards/cosine_scaled_reward": 0.10907927341759205, + "step": 182 + }, + { + "clip_fraction": 0.0, + "completion_length": 1834.4375686645508, + "epoch": 0.20914285714285713, + "grad_norm": 0.026828886941075325, + "kl": 8.349120616912842e-05, + "lambda_div_used": 0.6341942846775055, + "learning_rate": 8.220696016880687e-07, + "loss": 0.0145, + "reward": 0.10971941862953827, + "reward_after_mean": 0.10971941862953827, + "reward_after_std": 0.6589909251779318, + "reward_before_mean": 0.4826927953399718, + "reward_before_std": 0.6231282472144812, + "reward_change_max": 0.0, + "reward_change_mean": -0.37297336757183075, + "reward_change_min": -0.5635729804635048, + "reward_change_std": 0.2322026826441288, + "reward_std": 0.6589909512549639, + "rewards/accuracy_reward": 0.3125000111758709, + "rewards/cosine_scaled_reward": 0.1701928018592298, + "step": 183 + }, + { + "clip_fraction": 0.0, + "completion_length": 2777.020854949951, + "epoch": 0.2102857142857143, + "grad_norm": 0.027208158746361732, + "kl": 0.00016835331916809082, + "lambda_div_used": 0.5735552906990051, + "learning_rate": 8.195606193320136e-07, + "loss": 0.0331, + "reward": -0.2550749061629176, + "reward_after_mean": -0.2550749061629176, + "reward_after_std": 0.45949564687907696, + "reward_before_mean": 0.09985405765473843, + "reward_before_std": 0.33237360091879964, + "reward_change_max": 0.0, + "reward_change_mean": -0.3549289759248495, + "reward_change_min": -0.4971868433058262, + "reward_change_std": 0.18658464308828115, + "reward_std": 0.45949566550552845, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.06681260792538524, + "step": 184 + }, + { + "clip_fraction": 0.0, + "completion_length": 2450.4583473205566, + "epoch": 0.21142857142857144, + "grad_norm": 0.031349100172519684, + "kl": 0.00011152029037475586, + "lambda_div_used": 0.5730742663145065, + "learning_rate": 8.170384989716657e-07, + "loss": 0.055, + "reward": -0.23662783950567245, + "reward_after_mean": -0.23662783950567245, + "reward_after_std": 0.3798432908952236, + "reward_before_mean": 0.08804147504270077, + "reward_before_std": 0.3363419594243169, + "reward_change_max": 0.0, + "reward_change_mean": -0.32466931641101837, + "reward_change_min": -0.47964803501963615, + "reward_change_std": 0.19531975220888853, + "reward_std": 0.3798433095216751, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/cosine_scaled_reward": -0.09945851005613804, + "step": 185 + }, + { + "clip_fraction": 0.0, + "completion_length": 2827.9375228881836, + "epoch": 0.21257142857142858, + "grad_norm": 0.018743637949228287, + "kl": 0.00014129281044006348, + "lambda_div_used": 0.6022143214941025, + "learning_rate": 8.145033635316128e-07, + "loss": -0.0243, + "reward": -0.19847029261291027, + "reward_after_mean": -0.19847029261291027, + "reward_after_std": 0.4997438360005617, + "reward_before_mean": 0.09954999759793282, + "reward_before_std": 0.4699443206191063, + "reward_change_max": 0.0, + "reward_change_mean": -0.298020301386714, + "reward_change_min": -0.4791484698653221, + "reward_change_std": 0.18491498567163944, + "reward_std": 0.49974384531378746, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/cosine_scaled_reward": -0.08794999308884144, + "step": 186 + }, + { + "clip_fraction": 0.0, + "completion_length": 2602.1875228881836, + "epoch": 0.21371428571428572, + "grad_norm": 0.030055196955800056, + "kl": 0.0001709461212158203, + "lambda_div_used": 0.5778695195913315, + "learning_rate": 8.119553365707802e-07, + "loss": -0.0784, + "reward": -0.23846351448446512, + "reward_after_mean": -0.23846351448446512, + "reward_after_std": 0.39364523626863956, + "reward_before_mean": 0.09842715226113796, + "reward_before_std": 0.35397925041615963, + "reward_change_max": 0.0, + "reward_change_mean": -0.3368906620889902, + "reward_change_min": -0.5288374535739422, + "reward_change_std": 0.20237108506262302, + "reward_std": 0.3936452493071556, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/cosine_scaled_reward": -0.047406171914190054, + "step": 187 + }, + { + "clip_fraction": 0.0, + "completion_length": 3423.875030517578, + "epoch": 0.21485714285714286, + "grad_norm": 0.01838378608226776, + "kl": 0.00020372867584228516, + "lambda_div_used": 0.5887190625071526, + "learning_rate": 8.093945422764069e-07, + "loss": 0.0025, + "reward": -0.24391454830765724, + "reward_after_mean": -0.24391454830765724, + "reward_after_std": 0.43992058746516705, + "reward_before_mean": 0.055846452713012695, + "reward_before_std": 0.4102069940418005, + "reward_change_max": 0.0, + "reward_change_mean": -0.2997609917074442, + "reward_change_min": -0.501131433993578, + "reward_change_std": 0.19180170260369778, + "reward_std": 0.4399206154048443, + "rewards/accuracy_reward": 0.125, + "rewards/cosine_scaled_reward": -0.0691535547375679, + "step": 188 + }, + { + "clip_fraction": 0.0, + "completion_length": 1879.500057220459, + "epoch": 0.216, + "grad_norm": 0.034202635288238525, + "kl": 0.00012063980102539062, + "lambda_div_used": 0.6004914790391922, + "learning_rate": 8.068211054579943e-07, + "loss": -0.0391, + "reward": -0.1741858683526516, + "reward_after_mean": -0.1741858683526516, + "reward_after_std": 0.4978427290916443, + "reward_before_mean": 0.13175462279468775, + "reward_before_std": 0.466338312253356, + "reward_change_max": 0.0, + "reward_change_mean": -0.30594046600162983, + "reward_change_min": -0.47848713025450706, + "reward_change_std": 0.1904344316571951, + "reward_std": 0.49784273840487003, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/cosine_scaled_reward": -0.05574538931250572, + "step": 189 + }, + { + "clip_fraction": 0.0, + "completion_length": 2554.9583740234375, + "epoch": 0.21714285714285714, + "grad_norm": 0.02105082757771015, + "kl": 0.0001103430986404419, + "lambda_div_used": 0.6010043099522591, + "learning_rate": 8.04235151541222e-07, + "loss": 0.0273, + "reward": -0.017362398095428944, + "reward_after_mean": -0.017362398095428944, + "reward_after_std": 0.5378628317266703, + "reward_before_mean": 0.3967582155019045, + "reward_before_std": 0.46276178024709225, + "reward_change_max": 0.0, + "reward_change_mean": -0.4141206480562687, + "reward_change_min": -0.6200221106410027, + "reward_change_std": 0.24346179515123367, + "reward_std": 0.5378628373146057, + "rewards/accuracy_reward": 0.31250000558793545, + "rewards/cosine_scaled_reward": 0.08425821363925934, + "step": 190 + }, + { + "clip_fraction": 0.0, + "completion_length": 2059.0208702087402, + "epoch": 0.21828571428571428, + "grad_norm": 0.027044324204325676, + "kl": 0.0001252889633178711, + "lambda_div_used": 0.6293297410011292, + "learning_rate": 8.01636806561836e-07, + "loss": 0.0258, + "reward": 0.11435023881494999, + "reward_after_mean": 0.11435023881494999, + "reward_after_std": 0.6808852814137936, + "reward_before_mean": 0.5306741870008409, + "reward_before_std": 0.5962998084723949, + "reward_change_max": 0.0, + "reward_change_mean": -0.4163239523768425, + "reward_change_min": -0.6401379927992821, + "reward_change_std": 0.2505673002451658, + "reward_std": 0.680885311216116, + "rewards/accuracy_reward": 0.416666679084301, + "rewards/cosine_scaled_reward": 0.11400750931352377, + "step": 191 + }, + { + "clip_fraction": 0.0, + "completion_length": 3052.4583740234375, + "epoch": 0.21942857142857142, + "grad_norm": 0.01929704286158085, + "kl": 0.0001703500747680664, + "lambda_div_used": 0.5668376535177231, + "learning_rate": 7.990261971595048e-07, + "loss": -0.0488, + "reward": -0.18853843957185745, + "reward_after_mean": -0.18853843957185745, + "reward_after_std": 0.3872429598122835, + "reward_before_mean": 0.20527121797204018, + "reward_before_std": 0.30656870268285275, + "reward_change_max": 0.0, + "reward_change_mean": -0.39380968734622, + "reward_change_min": -0.5667809918522835, + "reward_change_std": 0.22394196968525648, + "reward_std": 0.38724296167492867, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/cosine_scaled_reward": -0.0030621085315942764, + "step": 192 + }, + { + "clip_fraction": 0.0, + "completion_length": 2665.0000610351562, + "epoch": 0.22057142857142858, + "grad_norm": 0.024683799594640732, + "kl": 0.0001424849033355713, + "lambda_div_used": 0.6040371954441071, + "learning_rate": 7.964034505716476e-07, + "loss": 0.1051, + "reward": -0.12321672588586807, + "reward_after_mean": -0.12321672588586807, + "reward_after_std": 0.49566064216196537, + "reward_before_mean": 0.19374842569231987, + "reward_before_std": 0.4829600788652897, + "reward_change_max": 0.0, + "reward_change_mean": -0.316965164616704, + "reward_change_min": -0.5347904153168201, + "reward_change_std": 0.2084766924381256, + "reward_std": 0.49566065706312656, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/cosine_scaled_reward": -0.05625157803297043, + "step": 193 + }, + { + "clip_fraction": 0.0, + "completion_length": 2681.6666870117188, + "epoch": 0.22171428571428572, + "grad_norm": 0.01963093690574169, + "kl": 0.0001462697982788086, + "lambda_div_used": 0.6072653383016586, + "learning_rate": 7.93768694627233e-07, + "loss": 0.0685, + "reward": 0.2620888948440552, + "reward_after_mean": 0.2620888948440552, + "reward_after_std": 0.5582387447357178, + "reward_before_mean": 0.7925186604261398, + "reward_before_std": 0.4891574867069721, + "reward_change_max": 0.0, + "reward_change_mean": -0.5304297637194395, + "reward_change_min": -0.7805496528744698, + "reward_change_std": 0.3099254406988621, + "reward_std": 0.5582387670874596, + "rewards/accuracy_reward": 0.541666679084301, + "rewards/cosine_scaled_reward": 0.250851983204484, + "step": 194 + }, + { + "clip_fraction": 0.0, + "completion_length": 2509.2708892822266, + "epoch": 0.22285714285714286, + "grad_norm": 0.021495619788765907, + "kl": 0.00014010071754455566, + "lambda_div_used": 0.6250215768814087, + "learning_rate": 7.911220577405484e-07, + "loss": -0.0159, + "reward": -0.02809133753180504, + "reward_after_mean": -0.02809133753180504, + "reward_after_std": 0.5878969728946686, + "reward_before_mean": 0.3000528886914253, + "reward_before_std": 0.5790729988366365, + "reward_change_max": 0.0, + "reward_change_mean": -0.32814422622323036, + "reward_change_min": -0.5872809514403343, + "reward_change_std": 0.22489875741302967, + "reward_std": 0.5878969803452492, + "rewards/accuracy_reward": 0.2708333395421505, + "rewards/cosine_scaled_reward": 0.029219531919807196, + "step": 195 + }, + { + "clip_fraction": 0.0, + "completion_length": 3391.9166870117188, + "epoch": 0.224, + "grad_norm": 0.01908985897898674, + "kl": 0.00020110607147216797, + "lambda_div_used": 0.5609800890088081, + "learning_rate": 7.884636689049422e-07, + "loss": -0.0096, + "reward": -0.2512575164437294, + "reward_after_mean": -0.2512575164437294, + "reward_after_std": 0.3720796424895525, + "reward_before_mean": 0.114608995616436, + "reward_before_std": 0.27679250249639153, + "reward_change_max": 0.0, + "reward_change_mean": -0.3658665083348751, + "reward_change_min": -0.5327885784208775, + "reward_change_std": 0.20587429776787758, + "reward_std": 0.37207965552806854, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/cosine_scaled_reward": -0.09372434392571449, + "step": 196 + }, + { + "clip_fraction": 0.0, + "completion_length": 2261.50008392334, + "epoch": 0.22514285714285714, + "grad_norm": 0.0338062159717083, + "kl": 0.00014703720808029175, + "lambda_div_used": 0.6676772907376289, + "learning_rate": 7.857936576865356e-07, + "loss": 0.0118, + "reward": 0.3714135689660907, + "reward_after_mean": 0.3714135689660907, + "reward_after_std": 0.7947616018354893, + "reward_before_mean": 0.8371437452733517, + "reward_before_std": 0.7848006598651409, + "reward_change_max": 0.0, + "reward_change_mean": -0.4657301902770996, + "reward_change_min": -0.7909989431500435, + "reward_change_std": 0.31981481425464153, + "reward_std": 0.7947616167366505, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/cosine_scaled_reward": 0.2746437588939443, + "step": 197 + }, + { + "clip_fraction": 0.0, + "completion_length": 2588.0000381469727, + "epoch": 0.22628571428571428, + "grad_norm": 0.02565637417137623, + "kl": 0.00015842914581298828, + "lambda_div_used": 0.5861488357186317, + "learning_rate": 7.831121542179086e-07, + "loss": 0.0107, + "reward": -0.04248452000319958, + "reward_after_mean": -0.04248452000319958, + "reward_after_std": 0.5402837041765451, + "reward_before_mean": 0.3971143513917923, + "reward_before_std": 0.3975243829190731, + "reward_change_max": 0.0, + "reward_change_mean": -0.4395988676697016, + "reward_change_min": -0.6525615304708481, + "reward_change_std": 0.24251667596399784, + "reward_std": 0.5402837190777063, + "rewards/accuracy_reward": 0.33333333395421505, + "rewards/cosine_scaled_reward": 0.06378100253641605, + "step": 198 + }, + { + "clip_fraction": 0.0, + "completion_length": 3565.500030517578, + "epoch": 0.22742857142857142, + "grad_norm": 0.017493488267064095, + "kl": 0.00019216537475585938, + "lambda_div_used": 0.6027035862207413, + "learning_rate": 7.804192891917571e-07, + "loss": 0.0044, + "reward": -0.25367068126797676, + "reward_after_mean": -0.25367068126797676, + "reward_after_std": 0.5018568355590105, + "reward_before_mean": 0.015577135607600212, + "reward_before_std": 0.4789720713160932, + "reward_change_max": 0.0, + "reward_change_mean": -0.2692478112876415, + "reward_change_min": -0.49732755869627, + "reward_change_std": 0.18694379180669785, + "reward_std": 0.501856841146946, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.15108953975141048, + "step": 199 + }, + { + "clip_fraction": 0.0, + "completion_length": 2030.7708587646484, + "epoch": 0.22857142857142856, + "grad_norm": 0.023603590205311775, + "kl": 8.529424667358398e-05, + "lambda_div_used": 0.6037929654121399, + "learning_rate": 7.777151938545235e-07, + "loss": 0.0494, + "reward": 0.20463257655501366, + "reward_after_mean": 0.20463257655501366, + "reward_after_std": 0.607479989528656, + "reward_before_mean": 0.7358259493485093, + "reward_before_std": 0.48482801718637347, + "reward_change_max": 0.0, + "reward_change_mean": -0.531193383038044, + "reward_change_min": -0.7928582988679409, + "reward_change_std": 0.3125428520143032, + "reward_std": 0.6074800062924623, + "rewards/accuracy_reward": 0.4583333395421505, + "rewards/cosine_scaled_reward": 0.2774925837293267, + "step": 200 + }, + { + "clip_fraction": 0.0, + "completion_length": 2393.2708740234375, + "epoch": 0.2297142857142857, + "grad_norm": 0.030843405053019524, + "kl": 0.00012966245412826538, + "lambda_div_used": 0.6355544030666351, + "learning_rate": 7.75e-07, + "loss": -0.0226, + "reward": 0.36713023856282234, + "reward_after_mean": 0.36713023856282234, + "reward_after_std": 0.7189916651695967, + "reward_before_mean": 0.8961770609021187, + "reward_before_std": 0.6416445402428508, + "reward_change_max": 0.0, + "reward_change_mean": -0.5290468074381351, + "reward_change_min": -0.790231991559267, + "reward_change_std": 0.33059168234467506, + "reward_std": 0.7189916893839836, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/cosine_scaled_reward": 0.3128436878323555, + "step": 201 + }, + { + "clip_fraction": 0.0, + "completion_length": 2136.437526702881, + "epoch": 0.23085714285714284, + "grad_norm": 0.02494831755757332, + "kl": 9.03918407857418e-05, + "lambda_div_used": 0.5813711285591125, + "learning_rate": 7.72273839962904e-07, + "loss": -0.0212, + "reward": 0.20651111379265785, + "reward_after_mean": 0.20651111379265785, + "reward_after_std": 0.5735384915024042, + "reward_before_mean": 0.8149078581482172, + "reward_before_std": 0.3815823132172227, + "reward_change_max": 0.0, + "reward_change_mean": -0.6083967536687851, + "reward_change_min": -0.8435066714882851, + "reward_change_std": 0.3385826703161001, + "reward_std": 0.5735384933650494, + "rewards/accuracy_reward": 0.5208333358168602, + "rewards/cosine_scaled_reward": 0.2940745260566473, + "step": 202 + }, + { + "clip_fraction": 0.0, + "completion_length": 3241.125030517578, + "epoch": 0.232, + "grad_norm": 0.019406091421842575, + "kl": 0.00020945072174072266, + "lambda_div_used": 0.5796335637569427, + "learning_rate": 7.695368466124296e-07, + "loss": 0.0001, + "reward": -0.171187374740839, + "reward_after_mean": -0.171187374740839, + "reward_after_std": 0.4753460921347141, + "reward_before_mean": 0.21428256388753653, + "reward_before_std": 0.3680141428485513, + "reward_change_max": 0.0, + "reward_change_mean": -0.38546993769705296, + "reward_change_min": -0.565359104424715, + "reward_change_std": 0.21748895198106766, + "reward_std": 0.4753460939973593, + "rewards/accuracy_reward": 0.20833333395421505, + "rewards/cosine_scaled_reward": 0.0059492262080311775, + "step": 203 + }, + { + "clip_fraction": 0.0, + "completion_length": 1740.6458892822266, + "epoch": 0.23314285714285715, + "grad_norm": 0.0317753441631794, + "kl": 0.00010880827903747559, + "lambda_div_used": 0.6080649197101593, + "learning_rate": 7.667891533457718e-07, + "loss": 0.1061, + "reward": 0.01815126556903124, + "reward_after_mean": 0.01815126556903124, + "reward_after_std": 0.5492583587765694, + "reward_before_mean": 0.41005287505686283, + "reward_before_std": 0.4979597805067897, + "reward_change_max": 0.0, + "reward_change_mean": -0.39190160669386387, + "reward_change_min": -0.5888865925371647, + "reward_change_std": 0.23326328117400408, + "reward_std": 0.5492583997547626, + "rewards/accuracy_reward": 0.31250000558793545, + "rewards/cosine_scaled_reward": 0.09755285223945975, + "step": 204 + }, + { + "clip_fraction": 0.0, + "completion_length": 2370.875068664551, + "epoch": 0.2342857142857143, + "grad_norm": 0.030042720958590508, + "kl": 0.00014892220497131348, + "lambda_div_used": 0.6162895858287811, + "learning_rate": 7.640308940816239e-07, + "loss": 0.1316, + "reward": 0.43424203619360924, + "reward_after_mean": 0.43424203619360924, + "reward_after_std": 0.6284053698182106, + "reward_before_mean": 1.0446599274873734, + "reward_before_std": 0.5331263300031424, + "reward_change_max": 0.0, + "reward_change_mean": -0.610417865216732, + "reward_change_min": -0.8885884135961533, + "reward_change_std": 0.35387465916574, + "reward_std": 0.6284053847193718, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/cosine_scaled_reward": 0.41965989768505096, + "step": 205 + }, + { + "clip_fraction": 0.0, + "completion_length": 2687.0208587646484, + "epoch": 0.23542857142857143, + "grad_norm": 0.021548230201005936, + "kl": 0.00012874603271484375, + "lambda_div_used": 0.6292116791009903, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0002, + "reward": -0.19604980573058128, + "reward_after_mean": -0.19604980573058128, + "reward_after_std": 0.6235801223665476, + "reward_before_mean": 0.05089604668319225, + "reward_before_std": 0.606025786139071, + "reward_change_max": 0.0, + "reward_change_mean": -0.24694585241377354, + "reward_change_min": -0.5342049337923527, + "reward_change_std": 0.1862892871722579, + "reward_std": 0.623580127954483, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/cosine_scaled_reward": -0.1157706193625927, + "step": 206 + }, + { + "clip_fraction": 0.0, + "completion_length": 2933.145866394043, + "epoch": 0.23657142857142857, + "grad_norm": 0.02744130790233612, + "kl": 0.0001799464225769043, + "lambda_div_used": 0.5820295214653015, + "learning_rate": 7.584832158039378e-07, + "loss": -0.0561, + "reward": -0.27963498421013355, + "reward_after_mean": -0.27963498421013355, + "reward_after_std": 0.4231335464864969, + "reward_before_mean": 0.013983679935336113, + "reward_before_std": 0.377011489123106, + "reward_change_max": 0.0, + "reward_change_mean": -0.2936186585575342, + "reward_change_min": -0.4932614788413048, + "reward_change_std": 0.18361396715044975, + "reward_std": 0.423133572563529, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/cosine_scaled_reward": -0.11101631447672844, + "step": 207 + }, + { + "clip_fraction": 0.0, + "completion_length": 2700.4583587646484, + "epoch": 0.2377142857142857, + "grad_norm": 0.022379335016012192, + "kl": 0.00015173852443695068, + "lambda_div_used": 0.6108261719346046, + "learning_rate": 7.556940671764124e-07, + "loss": 0.0624, + "reward": -0.02188705001026392, + "reward_after_mean": -0.02188705001026392, + "reward_after_std": 0.5500882770866156, + "reward_before_mean": 0.3470336627215147, + "reward_before_std": 0.5068763047456741, + "reward_change_max": 0.0, + "reward_change_mean": -0.3689207211136818, + "reward_change_min": -0.5766280777752399, + "reward_change_std": 0.22263448685407639, + "reward_std": 0.550088282674551, + "rewards/accuracy_reward": 0.27083334513008595, + "rewards/cosine_scaled_reward": 0.07620031712576747, + "step": 208 + }, + { + "clip_fraction": 0.0, + "completion_length": 2061.041732788086, + "epoch": 0.23885714285714285, + "grad_norm": 0.029824599623680115, + "kl": 9.86829400062561e-05, + "lambda_div_used": 0.6090070083737373, + "learning_rate": 7.528948933102438e-07, + "loss": -0.0351, + "reward": -0.05778682604432106, + "reward_after_mean": -0.05778682604432106, + "reward_after_std": 0.5228247437626123, + "reward_before_mean": 0.2826935350894928, + "reward_before_std": 0.501367649412714, + "reward_change_max": 0.0, + "reward_change_mean": -0.3404803555458784, + "reward_change_min": -0.5473614186048508, + "reward_change_std": 0.21792252641171217, + "reward_std": 0.5228247474879026, + "rewards/accuracy_reward": 0.3125000111758709, + "rewards/cosine_scaled_reward": -0.029806464910507202, + "step": 209 + }, + { + "clip_fraction": 0.0, + "completion_length": 2671.9583740234375, + "epoch": 0.24, + "grad_norm": 0.019723398610949516, + "kl": 0.0001126602292060852, + "lambda_div_used": 0.6178258955478668, + "learning_rate": 7.500858306332172e-07, + "loss": -0.046, + "reward": -0.019846799783408642, + "reward_after_mean": -0.019846799783408642, + "reward_after_std": 0.6330938600003719, + "reward_before_mean": 0.3628573752939701, + "reward_before_std": 0.5453151864930987, + "reward_change_max": 0.0, + "reward_change_mean": -0.38270418159663677, + "reward_change_min": -0.5824633538722992, + "reward_change_std": 0.22786249686032534, + "reward_std": 0.6330938655883074, + "rewards/accuracy_reward": 0.3125000037252903, + "rewards/cosine_scaled_reward": 0.05035736900754273, + "step": 210 + }, + { + "clip_fraction": 0.0, + "completion_length": 2191.2500076293945, + "epoch": 0.24114285714285713, + "grad_norm": 0.026132306084036827, + "kl": 0.00012756884098052979, + "lambda_div_used": 0.5773908644914627, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0368, + "reward": -0.09260139870457351, + "reward_after_mean": -0.09260139870457351, + "reward_after_std": 0.5260053481906652, + "reward_before_mean": 0.3510722735663876, + "reward_before_std": 0.35058190673589706, + "reward_change_max": 0.0, + "reward_change_mean": -0.4436736721545458, + "reward_change_min": -0.5985335633158684, + "reward_change_std": 0.22750128898769617, + "reward_std": 0.5260053630918264, + "rewards/accuracy_reward": 0.2916666679084301, + "rewards/cosine_scaled_reward": 0.05940559017471969, + "step": 211 + }, + { + "clip_fraction": 0.0, + "completion_length": 1899.8750457763672, + "epoch": 0.2422857142857143, + "grad_norm": 0.03510681912302971, + "kl": 0.000111408531665802, + "lambda_div_used": 0.6011399254202843, + "learning_rate": 7.444385869608921e-07, + "loss": 0.0147, + "reward": 0.018216299824416637, + "reward_after_mean": 0.018216299824416637, + "reward_after_std": 0.5794783290475607, + "reward_before_mean": 0.4559548683464527, + "reward_before_std": 0.47238516760990024, + "reward_change_max": 0.0, + "reward_change_mean": -0.43773859925568104, + "reward_change_min": -0.6772446036338806, + "reward_change_std": 0.26400116458535194, + "reward_std": 0.5794783346354961, + "rewards/accuracy_reward": 0.4375000074505806, + "rewards/cosine_scaled_reward": 0.018454871140420437, + "step": 212 + }, + { + "clip_fraction": 0.0, + "completion_length": 2187.2708587646484, + "epoch": 0.24342857142857144, + "grad_norm": 0.02723986841738224, + "kl": 0.00015923380851745605, + "lambda_div_used": 0.6033707112073898, + "learning_rate": 7.416006812042827e-07, + "loss": 0.0702, + "reward": 0.11780917271971703, + "reward_after_mean": 0.11780917271971703, + "reward_after_std": 0.6044025905430317, + "reward_before_mean": 0.6168809719383717, + "reward_before_std": 0.4788802685216069, + "reward_change_max": 0.0, + "reward_change_mean": -0.4990718085318804, + "reward_change_min": -0.774650864303112, + "reward_change_std": 0.2952164225280285, + "reward_std": 0.6044025998562574, + "rewards/accuracy_reward": 0.4375000037252903, + "rewards/cosine_scaled_reward": 0.17938095517456532, + "step": 213 + }, + { + "clip_fraction": 0.0, + "completion_length": 2442.375072479248, + "epoch": 0.24457142857142858, + "grad_norm": 0.024862557649612427, + "kl": 0.00014431774616241455, + "lambda_div_used": 0.5444722771644592, + "learning_rate": 7.387534371007797e-07, + "loss": -0.0237, + "reward": -0.20888542756438255, + "reward_after_mean": -0.20888542756438255, + "reward_after_std": 0.3397774752229452, + "reward_before_mean": 0.22809876408427954, + "reward_before_std": 0.19605009350925684, + "reward_change_max": 0.0, + "reward_change_mean": -0.436984209343791, + "reward_change_min": -0.5989037677645683, + "reward_change_std": 0.22710295487195253, + "reward_std": 0.339777497574687, + "rewards/accuracy_reward": 0.25, + "rewards/cosine_scaled_reward": -0.021901232190430164, + "step": 214 + }, + { + "clip_fraction": 0.0, + "completion_length": 1974.0000457763672, + "epoch": 0.24571428571428572, + "grad_norm": 0.03193683549761772, + "kl": 0.00011608004570007324, + "lambda_div_used": 0.6048371568322182, + "learning_rate": 7.358969934210438e-07, + "loss": -0.0834, + "reward": -0.23377530556172132, + "reward_after_mean": -0.23377530556172132, + "reward_after_std": 0.5205546151846647, + "reward_before_mean": 0.02766125090420246, + "reward_before_std": 0.48529171757400036, + "reward_change_max": 0.0, + "reward_change_mean": -0.2614365555346012, + "reward_change_min": -0.44492336362600327, + "reward_change_std": 0.16870077326893806, + "reward_std": 0.5205546207726002, + "rewards/accuracy_reward": 0.14583333767950535, + "rewards/cosine_scaled_reward": -0.11817209050059319, + "step": 215 + }, + { + "clip_fraction": 0.0, + "completion_length": 2073.7917137145996, + "epoch": 0.24685714285714286, + "grad_norm": 0.03090011700987816, + "kl": 0.00015020370483398438, + "lambda_div_used": 0.5987100675702095, + "learning_rate": 7.330314893841101e-07, + "loss": -0.037, + "reward": -0.0010385997593402863, + "reward_after_mean": -0.0010385997593402863, + "reward_after_std": 0.5259138215333223, + "reward_before_mean": 0.42963695898652077, + "reward_before_std": 0.4547121487557888, + "reward_change_max": 0.0, + "reward_change_mean": -0.43067559227347374, + "reward_change_min": -0.6873270347714424, + "reward_change_std": 0.2589325439184904, + "reward_std": 0.5259138215333223, + "rewards/accuracy_reward": 0.3541666679084301, + "rewards/cosine_scaled_reward": 0.07547031342983246, + "step": 216 + }, + { + "clip_fraction": 0.0, + "completion_length": 2313.416679382324, + "epoch": 0.248, + "grad_norm": 0.02645043283700943, + "kl": 0.00013341009616851807, + "lambda_div_used": 0.6059171706438065, + "learning_rate": 7.301570646506027e-07, + "loss": 0.005, + "reward": 0.06280752643942833, + "reward_after_mean": 0.06280752643942833, + "reward_after_std": 0.5530473850667477, + "reward_before_mean": 0.4911847524344921, + "reward_before_std": 0.4922928689047694, + "reward_change_max": 0.0, + "reward_change_mean": -0.42837722785770893, + "reward_change_min": -0.6544801071286201, + "reward_change_std": 0.2630965141579509, + "reward_std": 0.5530474036931992, + "rewards/accuracy_reward": 0.3750000074505806, + "rewards/cosine_scaled_reward": 0.11618476174771786, + "step": 217 + }, + { + "clip_fraction": 0.0, + "completion_length": 2668.9375381469727, + "epoch": 0.24914285714285714, + "grad_norm": 0.023471172899007797, + "kl": 0.00011789798736572266, + "lambda_div_used": 0.6332497969269753, + "learning_rate": 7.27273859315928e-07, + "loss": -0.0061, + "reward": 0.06328068673610687, + "reward_after_mean": 0.06328068673610687, + "reward_after_std": 0.631486464291811, + "reward_before_mean": 0.4275508373975754, + "reward_before_std": 0.613295029848814, + "reward_change_max": 0.0, + "reward_change_mean": -0.3642701506614685, + "reward_change_min": -0.593178354203701, + "reward_change_std": 0.23565197084099054, + "reward_std": 0.6314864810556173, + "rewards/accuracy_reward": 0.33333334513008595, + "rewards/cosine_scaled_reward": 0.09421749995090067, + "step": 218 + }, + { + "clip_fraction": 0.0, + "completion_length": 2036.7292098999023, + "epoch": 0.2502857142857143, + "grad_norm": 0.031107638031244278, + "kl": 0.00014078617095947266, + "lambda_div_used": 0.6064230278134346, + "learning_rate": 7.243820139034464e-07, + "loss": 0.0536, + "reward": 0.17783035337924957, + "reward_after_mean": 0.17783035337924957, + "reward_after_std": 0.6028466131538153, + "reward_before_mean": 0.6854837816208601, + "reward_before_std": 0.4974929317831993, + "reward_change_max": 0.0, + "reward_change_mean": -0.5076534673571587, + "reward_change_min": -0.7541679181158543, + "reward_change_std": 0.3024881314486265, + "reward_std": 0.602846622467041, + "rewards/accuracy_reward": 0.4583333358168602, + "rewards/cosine_scaled_reward": 0.22715043649077415, + "step": 219 + }, + { + "clip_fraction": 0.0, + "completion_length": 2585.562515258789, + "epoch": 0.25142857142857145, + "grad_norm": 0.033569689840078354, + "kl": 0.00014656782150268555, + "lambda_div_used": 0.5626775473356247, + "learning_rate": 7.214816693576234e-07, + "loss": 0.0096, + "reward": -0.44609223771840334, + "reward_after_mean": -0.44609223771840334, + "reward_after_std": 0.34080066718161106, + "reward_before_mean": -0.19077827036380768, + "reward_before_std": 0.2862963704392314, + "reward_change_max": 0.0, + "reward_change_mean": -0.25531397201120853, + "reward_change_min": -0.4116707444190979, + "reward_change_std": 0.15291727520525455, + "reward_std": 0.340800691395998, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/cosine_scaled_reward": -0.23244493766105734, + "step": 220 + }, + { + "clip_fraction": 0.0, + "completion_length": 1835.1667137145996, + "epoch": 0.25257142857142856, + "grad_norm": 0.026043305173516273, + "kl": 9.073130786418915e-05, + "lambda_div_used": 0.589106909930706, + "learning_rate": 7.185729670371604e-07, + "loss": -0.0167, + "reward": 0.1618174184113741, + "reward_after_mean": 0.1618174184113741, + "reward_after_std": 0.5352318156510592, + "reward_before_mean": 0.7099000085145235, + "reward_before_std": 0.40553835732862353, + "reward_change_max": 0.0, + "reward_change_mean": -0.5480826254934072, + "reward_change_min": -0.7475878298282623, + "reward_change_std": 0.3015699228271842, + "reward_std": 0.5352318380028009, + "rewards/accuracy_reward": 0.47916667722165585, + "rewards/cosine_scaled_reward": 0.23073333408683538, + "step": 221 + }, + { + "clip_fraction": 0.0, + "completion_length": 2023.6875305175781, + "epoch": 0.2537142857142857, + "grad_norm": 0.024540327489376068, + "kl": 0.0001620650291442871, + "lambda_div_used": 0.5971302166581154, + "learning_rate": 7.156560487081051e-07, + "loss": 0.0082, + "reward": 0.04625087231397629, + "reward_after_mean": 0.04625087231397629, + "reward_after_std": 0.5205375533550978, + "reward_before_mean": 0.4992250055074692, + "reward_before_std": 0.4442645478993654, + "reward_change_max": 0.0, + "reward_change_mean": -0.45297410897910595, + "reward_change_min": -0.7014825120568275, + "reward_change_std": 0.27082843892276287, + "reward_std": 0.5205375626683235, + "rewards/accuracy_reward": 0.3541666753590107, + "rewards/cosine_scaled_reward": 0.145058311522007, + "step": 222 + }, + { + "clip_fraction": 0.0, + "completion_length": 2102.104202270508, + "epoch": 0.25485714285714284, + "grad_norm": 0.02421215921640396, + "kl": 0.00010086596012115479, + "lambda_div_used": 0.6135998442769051, + "learning_rate": 7.127310565369415e-07, + "loss": 0.0077, + "reward": -0.07060145400464535, + "reward_after_mean": -0.07060145400464535, + "reward_after_std": 0.5450965594500303, + "reward_before_mean": 0.2666409400990233, + "reward_before_std": 0.5180736510083079, + "reward_change_max": 0.0, + "reward_change_mean": -0.33724240958690643, + "reward_change_min": -0.5495268329977989, + "reward_change_std": 0.2100124368444085, + "reward_std": 0.5450965687632561, + "rewards/accuracy_reward": 0.27083334513008595, + "rewards/cosine_scaled_reward": -0.004192400723695755, + "step": 223 + }, + { + "clip_fraction": 0.0, + "completion_length": 2678.8958892822266, + "epoch": 0.256, + "grad_norm": 0.0209470484405756, + "kl": 0.00014913082122802734, + "lambda_div_used": 0.6316534802317619, + "learning_rate": 7.097981330836616e-07, + "loss": 0.041, + "reward": 0.002024895278736949, + "reward_after_mean": 0.002024895278736949, + "reward_after_std": 0.6682235784828663, + "reward_before_mean": 0.36611822061240673, + "reward_before_std": 0.6153735313564539, + "reward_change_max": 0.0, + "reward_change_mean": -0.36409333534538746, + "reward_change_min": -0.6180168017745018, + "reward_change_std": 0.23850849829614162, + "reward_std": 0.6682236194610596, + "rewards/accuracy_reward": 0.33333333767950535, + "rewards/cosine_scaled_reward": 0.03278488974319771, + "step": 224 + }, + { + "clip_fraction": 0.0, + "completion_length": 2463.6875610351562, + "epoch": 0.2571428571428571, + "grad_norm": 0.026948727667331696, + "kl": 0.0001347959041595459, + "lambda_div_used": 0.6428607329726219, + "learning_rate": 7.068574212948169e-07, + "loss": 0.0403, + "reward": 0.05994867905974388, + "reward_after_mean": 0.05994867905974388, + "reward_after_std": 0.6384792737662792, + "reward_before_mean": 0.3915696498006582, + "reward_before_std": 0.6680623888969421, + "reward_change_max": 0.0, + "reward_change_mean": -0.3316209614276886, + "reward_change_min": -0.606514610350132, + "reward_change_std": 0.24293010961264372, + "reward_std": 0.638479296118021, + "rewards/accuracy_reward": 0.3541666753590107, + "rewards/cosine_scaled_reward": 0.03740297071635723, + "step": 225 + }, + { + "clip_fraction": 0.0, + "completion_length": 2275.479232788086, + "epoch": 0.2582857142857143, + "grad_norm": 0.0262776929885149, + "kl": 0.00012694299221038818, + "lambda_div_used": 0.6046253740787506, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0006, + "reward": 0.08375599328428507, + "reward_after_mean": 0.08375599328428507, + "reward_after_std": 0.5798605680465698, + "reward_before_mean": 0.5400239741429687, + "reward_before_std": 0.47723895218223333, + "reward_change_max": 0.0, + "reward_change_mean": -0.45626799017190933, + "reward_change_min": -0.6333699934184551, + "reward_change_std": 0.2536289654672146, + "reward_std": 0.579860582947731, + "rewards/accuracy_reward": 0.3750000111758709, + "rewards/cosine_scaled_reward": 0.16502396669238806, + "step": 226 + }, + { + "clip_fraction": 0.0, + "completion_length": 1853.1041793823242, + "epoch": 0.25942857142857145, + "grad_norm": 0.03546634316444397, + "kl": 0.00011576712131500244, + "lambda_div_used": 0.6238459944725037, + "learning_rate": 7.009532063876148e-07, + "loss": -0.0356, + "reward": 0.035207513719797134, + "reward_after_mean": 0.035207513719797134, + "reward_after_std": 0.5671821534633636, + "reward_before_mean": 0.38889277167618275, + "reward_before_std": 0.5772030726075172, + "reward_change_max": 0.0, + "reward_change_mean": -0.35368524491786957, + "reward_change_min": -0.587718054652214, + "reward_change_std": 0.2398481909185648, + "reward_std": 0.5671821553260088, + "rewards/accuracy_reward": 0.3125000074505806, + "rewards/cosine_scaled_reward": 0.07639275304973125, + "step": 227 + }, + { + "clip_fraction": 0.0, + "completion_length": 2130.875026702881, + "epoch": 0.26057142857142856, + "grad_norm": 0.03324354812502861, + "kl": 0.00012753158807754517, + "lambda_div_used": 0.566599652171135, + "learning_rate": 6.979899910323624e-07, + "loss": -0.0669, + "reward": 0.011986830271780491, + "reward_after_mean": 0.011986830271780491, + "reward_after_std": 0.4835386872291565, + "reward_before_mean": 0.5381738739088178, + "reward_before_std": 0.30367479752749205, + "reward_change_max": 0.0, + "reward_change_mean": -0.5261870250105858, + "reward_change_min": -0.6992091946303844, + "reward_change_std": 0.27626297529786825, + "reward_std": 0.48353871516883373, + "rewards/accuracy_reward": 0.37500000558793545, + "rewards/cosine_scaled_reward": 0.1631738357245922, + "step": 228 + }, + { + "clip_fraction": 0.0, + "completion_length": 3159.562530517578, + "epoch": 0.26171428571428573, + "grad_norm": 0.026536332443356514, + "kl": 0.00018978118896484375, + "lambda_div_used": 0.5688631013035774, + "learning_rate": 6.950195628537299e-07, + "loss": -0.0295, + "reward": -0.11544675379991531, + "reward_after_mean": -0.11544675379991531, + "reward_after_std": 0.42418220825493336, + "reward_before_mean": 0.31868776679039, + "reward_before_std": 0.3126910990104079, + "reward_change_max": 0.0, + "reward_change_mean": -0.4341345224529505, + "reward_change_min": -0.63496870175004, + "reward_change_std": 0.24463962391018867, + "reward_std": 0.42418221198022366, + "rewards/accuracy_reward": 0.2708333395421505, + "rewards/cosine_scaled_reward": 0.047854430973529816, + "step": 229 + }, + { + "clip_fraction": 0.0, + "completion_length": 2753.3333740234375, + "epoch": 0.26285714285714284, + "grad_norm": 0.020300425589084625, + "kl": 0.00014778971672058105, + "lambda_div_used": 0.6197129040956497, + "learning_rate": 6.920420666261961e-07, + "loss": 0.0738, + "reward": -0.06457636877894402, + "reward_after_mean": -0.06457636877894402, + "reward_after_std": 0.5870498064905405, + "reward_before_mean": 0.2789040170609951, + "reward_before_std": 0.5545760486274958, + "reward_change_max": 0.0, + "reward_change_mean": -0.34348038397729397, + "reward_change_min": -0.6167686618864536, + "reward_change_std": 0.22966008260846138, + "reward_std": 0.5870498213917017, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/cosine_scaled_reward": 0.008070665411651134, + "step": 230 + }, + { + "clip_fraction": 0.0, + "completion_length": 2388.3750610351562, + "epoch": 0.264, + "grad_norm": 0.025614017620682716, + "kl": 0.00012599676847457886, + "lambda_div_used": 0.6009133085608482, + "learning_rate": 6.890576474687263e-07, + "loss": -0.0117, + "reward": -0.07801849395036697, + "reward_after_mean": -0.07801849395036697, + "reward_after_std": 0.5619381573051214, + "reward_before_mean": 0.2970875895989593, + "reward_before_std": 0.4653975451365113, + "reward_change_max": 0.0, + "reward_change_mean": -0.37510609440505505, + "reward_change_min": -0.5894374549388885, + "reward_change_std": 0.2189607135951519, + "reward_std": 0.5619381796568632, + "rewards/accuracy_reward": 0.27083333767950535, + "rewards/cosine_scaled_reward": 0.02625426184386015, + "step": 231 + }, + { + "clip_fraction": 0.0, + "completion_length": 2800.4792098999023, + "epoch": 0.2651428571428571, + "grad_norm": 0.020755581557750702, + "kl": 0.00017752498388290405, + "lambda_div_used": 0.6359871402382851, + "learning_rate": 6.860664508377001e-07, + "loss": 0.0213, + "reward": 0.09202059358358383, + "reward_after_mean": 0.09202059358358383, + "reward_after_std": 0.6515896432101727, + "reward_before_mean": 0.49167851358652115, + "reward_before_std": 0.6335036922246218, + "reward_change_max": 0.0, + "reward_change_mean": -0.3996579386293888, + "reward_change_min": -0.6971911080181599, + "reward_change_std": 0.2742554973810911, + "reward_std": 0.6515896506607533, + "rewards/accuracy_reward": 0.3958333358168602, + "rewards/cosine_scaled_reward": 0.09584518847987056, + "step": 232 + }, + { + "clip_fraction": 0.0, + "completion_length": 1828.020866394043, + "epoch": 0.2662857142857143, + "grad_norm": 0.028299635276198387, + "kl": 0.00010451674461364746, + "lambda_div_used": 0.6347432807087898, + "learning_rate": 6.83068622519821e-07, + "loss": -0.0488, + "reward": -0.11232293955981731, + "reward_after_mean": -0.11232293955981731, + "reward_after_std": 0.6607348509132862, + "reward_before_mean": 0.16373980697244406, + "reward_before_std": 0.6230235639959574, + "reward_change_max": 0.0, + "reward_change_mean": -0.2760627530515194, + "reward_change_min": -0.5052222050726414, + "reward_change_std": 0.18505325820297003, + "reward_std": 0.660734860226512, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/cosine_scaled_reward": -0.06542686396278441, + "step": 233 + }, + { + "clip_fraction": 0.0, + "completion_length": 2573.562526702881, + "epoch": 0.2674285714285714, + "grad_norm": 0.027019290253520012, + "kl": 0.0001166127622127533, + "lambda_div_used": 0.5475329235196114, + "learning_rate": 6.800643086250121e-07, + "loss": 0.0132, + "reward": -0.23357034847140312, + "reward_after_mean": -0.23357034847140312, + "reward_after_std": 0.3406812082976103, + "reward_before_mean": 0.17690101824700832, + "reward_before_std": 0.21493587270379066, + "reward_change_max": 0.0, + "reward_change_mean": -0.4104713797569275, + "reward_change_min": -0.5621155239641666, + "reward_change_std": 0.22167872916907072, + "reward_std": 0.34068121016025543, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/cosine_scaled_reward": -0.03143232688307762, + "step": 234 + }, + { + "clip_fraction": 0.0, + "completion_length": 2243.645851135254, + "epoch": 0.26857142857142857, + "grad_norm": 0.031080337241292, + "kl": 0.00015339255332946777, + "lambda_div_used": 0.6331789866089821, + "learning_rate": 6.770536555792944e-07, + "loss": -0.0167, + "reward": 0.05447516264393926, + "reward_after_mean": 0.05447516264393926, + "reward_after_std": 0.7066546399146318, + "reward_before_mean": 0.44694859720766544, + "reward_before_std": 0.6171103774104267, + "reward_change_max": 0.0, + "reward_change_mean": -0.39247346110641956, + "reward_change_min": -0.6285405829548836, + "reward_change_std": 0.23825406469404697, + "reward_std": 0.706654641777277, + "rewards/accuracy_reward": 0.3541666753590107, + "rewards/cosine_scaled_reward": 0.09278193739010021, + "step": 235 + }, + { + "clip_fraction": 0.0, + "completion_length": 2630.875030517578, + "epoch": 0.26971428571428574, + "grad_norm": 0.022363808006048203, + "kl": 0.00015428662300109863, + "lambda_div_used": 0.6662941351532936, + "learning_rate": 6.740368101176495e-07, + "loss": 0.026, + "reward": 0.1967709083110094, + "reward_after_mean": 0.1967709083110094, + "reward_after_std": 0.7583610694855452, + "reward_before_mean": 0.5491750640794635, + "reward_before_std": 0.7768743745982647, + "reward_change_max": 0.0, + "reward_change_mean": -0.35240414179861546, + "reward_change_min": -0.625398077070713, + "reward_change_std": 0.25161405000835657, + "reward_std": 0.758361091837287, + "rewards/accuracy_reward": 0.39583334513008595, + "rewards/cosine_scaled_reward": 0.15334171522408724, + "step": 236 + }, + { + "clip_fraction": 0.0, + "completion_length": 2439.7292289733887, + "epoch": 0.27085714285714285, + "grad_norm": 0.02510235831141472, + "kl": 0.00016179680824279785, + "lambda_div_used": 0.6136586889624596, + "learning_rate": 6.710139192768694e-07, + "loss": -0.0166, + "reward": 0.04647237854078412, + "reward_after_mean": 0.04647237854078412, + "reward_after_std": 0.6743428651243448, + "reward_before_mean": 0.47922211419790983, + "reward_before_std": 0.5223582116886973, + "reward_change_max": 0.0, + "reward_change_mean": -0.43274970538914204, + "reward_change_min": -0.6191227361559868, + "reward_change_std": 0.23846820835024118, + "reward_std": 0.6743428837507963, + "rewards/accuracy_reward": 0.37500000186264515, + "rewards/cosine_scaled_reward": 0.10422207851661369, + "step": 237 + }, + { + "clip_fraction": 0.0, + "completion_length": 2402.5000762939453, + "epoch": 0.272, + "grad_norm": 0.021976694464683533, + "kl": 0.0001609325408935547, + "lambda_div_used": 0.6034338474273682, + "learning_rate": 6.679851303883891e-07, + "loss": 0.0784, + "reward": 0.18511426215991378, + "reward_after_mean": 0.18511426215991378, + "reward_after_std": 0.6621626690030098, + "reward_before_mean": 0.7374962608737405, + "reward_before_std": 0.4762335177510977, + "reward_change_max": 0.0, + "reward_change_mean": -0.5523819867521524, + "reward_change_min": -0.7993261553347111, + "reward_change_std": 0.3063361942768097, + "reward_std": 0.6621626764535904, + "rewards/accuracy_reward": 0.5000000018626451, + "rewards/cosine_scaled_reward": 0.23749624891206622, + "step": 238 + }, + { + "clip_fraction": 0.0, + "completion_length": 1732.2292098999023, + "epoch": 0.27314285714285713, + "grad_norm": 0.03150353208184242, + "kl": 7.70464539527893e-05, + "lambda_div_used": 0.6118984445929527, + "learning_rate": 6.649505910711058e-07, + "loss": 0.0245, + "reward": 0.22464729472994804, + "reward_after_mean": 0.22464729472994804, + "reward_after_std": 0.5859156623482704, + "reward_before_mean": 0.7294908128678799, + "reward_before_std": 0.5119684813544154, + "reward_change_max": 0.0, + "reward_change_mean": -0.5048435050994158, + "reward_change_min": -0.7602570950984955, + "reward_change_std": 0.30051624588668346, + "reward_std": 0.5859156772494316, + "rewards/accuracy_reward": 0.4583333469927311, + "rewards/cosine_scaled_reward": 0.27115743793547153, + "step": 239 + }, + { + "clip_fraction": 0.0, + "completion_length": 2970.708366394043, + "epoch": 0.2742857142857143, + "grad_norm": 0.02445312589406967, + "kl": 0.00020241737365722656, + "lambda_div_used": 0.5594572946429253, + "learning_rate": 6.619104492241847e-07, + "loss": 0.0141, + "reward": -0.36844223737716675, + "reward_after_mean": -0.36844223737716675, + "reward_after_std": 0.32861490175127983, + "reward_before_mean": -0.06151419784873724, + "reward_before_std": 0.2699447488412261, + "reward_change_max": 0.0, + "reward_change_mean": -0.30692804232239723, + "reward_change_min": -0.46535007655620575, + "reward_change_std": 0.1784888058900833, + "reward_std": 0.3286149147897959, + "rewards/accuracy_reward": 0.0625, + "rewards/cosine_scaled_reward": -0.12401420716196299, + "step": 240 + }, + { + "clip_fraction": 0.0, + "completion_length": 2815.041702270508, + "epoch": 0.2754285714285714, + "grad_norm": 0.02061399444937706, + "kl": 0.0001932680606842041, + "lambda_div_used": 0.5598616823554039, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0225, + "reward": -0.4072608258575201, + "reward_after_mean": -0.4072608258575201, + "reward_after_std": 0.34731264412403107, + "reward_before_mean": -0.12633821368217468, + "reward_before_std": 0.2754332982003689, + "reward_change_max": 0.0, + "reward_change_mean": -0.2809226084500551, + "reward_change_min": -0.4441990442574024, + "reward_change_std": 0.1635214313864708, + "reward_std": 0.34731266647577286, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/cosine_scaled_reward": -0.16800487786531448, + "step": 241 + }, + { + "clip_fraction": 0.0, + "completion_length": 2020.520851135254, + "epoch": 0.2765714285714286, + "grad_norm": 0.03268786519765854, + "kl": 0.0002362281084060669, + "lambda_div_used": 0.5827708318829536, + "learning_rate": 6.558139508961654e-07, + "loss": 0.0641, + "reward": -0.14442800264805555, + "reward_after_mean": -0.14442800264805555, + "reward_after_std": 0.488038569688797, + "reward_before_mean": 0.23860891722142696, + "reward_before_std": 0.3739425097592175, + "reward_change_max": 0.0, + "reward_change_mean": -0.38303691893815994, + "reward_change_min": -0.5426856316626072, + "reward_change_std": 0.20754980947822332, + "reward_std": 0.4880385845899582, + "rewards/accuracy_reward": 0.2708333395421505, + "rewards/cosine_scaled_reward": -0.032224420458078384, + "step": 242 + }, + { + "clip_fraction": 0.0, + "completion_length": 2664.4375610351562, + "epoch": 0.2777142857142857, + "grad_norm": 0.02101411111652851, + "kl": 0.00011355429887771606, + "lambda_div_used": 0.6568357795476913, + "learning_rate": 6.527578915497951e-07, + "loss": 0.0052, + "reward": 0.12949350103735924, + "reward_after_mean": 0.12949350103735924, + "reward_after_std": 0.7312840819358826, + "reward_before_mean": 0.46976747084409, + "reward_before_std": 0.7229422759264708, + "reward_change_max": 0.0, + "reward_change_mean": -0.3402740005403757, + "reward_change_min": -0.5887857899069786, + "reward_change_std": 0.2261042231693864, + "reward_std": 0.7312840968370438, + "rewards/accuracy_reward": 0.354166679084301, + "rewards/cosine_scaled_reward": 0.11560080386698246, + "step": 243 + }, + { + "clip_fraction": 0.0, + "completion_length": 2822.6875381469727, + "epoch": 0.27885714285714286, + "grad_norm": 0.021215323358774185, + "kl": 0.00015109777450561523, + "lambda_div_used": 0.6380957439541817, + "learning_rate": 6.496968239287603e-07, + "loss": 0.023, + "reward": 0.23113884031772614, + "reward_after_mean": 0.23113884031772614, + "reward_after_std": 0.6932291053235531, + "reward_before_mean": 0.678146418184042, + "reward_before_std": 0.6383852679282427, + "reward_change_max": 0.0, + "reward_change_mean": -0.4470075909048319, + "reward_change_min": -0.6521747056394815, + "reward_change_std": 0.2655975092202425, + "reward_std": 0.6932291202247143, + "rewards/accuracy_reward": 0.4583333469927311, + "rewards/cosine_scaled_reward": 0.21981305815279484, + "step": 244 + }, + { + "clip_fraction": 0.0, + "completion_length": 2480.2500915527344, + "epoch": 0.28, + "grad_norm": 0.026185423135757446, + "kl": 0.00016620755195617676, + "lambda_div_used": 0.6584924161434174, + "learning_rate": 6.466308972251785e-07, + "loss": 0.058, + "reward": 0.19090854283422232, + "reward_after_mean": 0.19090854283422232, + "reward_after_std": 0.726154362782836, + "reward_before_mean": 0.5613497914746404, + "reward_before_std": 0.7352566458284855, + "reward_change_max": 0.0, + "reward_change_mean": -0.37044124491512775, + "reward_change_min": -0.6325159706175327, + "reward_change_std": 0.25586483906954527, + "reward_std": 0.7261543925851583, + "rewards/accuracy_reward": 0.41666668094694614, + "rewards/cosine_scaled_reward": 0.14468309609219432, + "step": 245 + }, + { + "clip_fraction": 0.0, + "completion_length": 2683.291717529297, + "epoch": 0.28114285714285714, + "grad_norm": 0.020767759531736374, + "kl": 0.00016689300537109375, + "lambda_div_used": 0.613718219101429, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0867, + "reward": -0.015127861872315407, + "reward_after_mean": -0.015127861872315407, + "reward_after_std": 0.5861052125692368, + "reward_before_mean": 0.36624928191304207, + "reward_before_std": 0.5300967525690794, + "reward_change_max": 0.0, + "reward_change_mean": -0.38137709721922874, + "reward_change_min": -0.6563880071043968, + "reward_change_std": 0.2452305220067501, + "reward_std": 0.5861052181571722, + "rewards/accuracy_reward": 0.3125000037252903, + "rewards/cosine_scaled_reward": 0.05374925094656646, + "step": 246 + }, + { + "clip_fraction": 0.0, + "completion_length": 3034.937545776367, + "epoch": 0.2822857142857143, + "grad_norm": 0.018628831952810287, + "kl": 0.00017150957137346268, + "lambda_div_used": 0.6288246288895607, + "learning_rate": 6.404850645156841e-07, + "loss": 0.0338, + "reward": -0.14502286911010742, + "reward_after_mean": -0.14502286911010742, + "reward_after_std": 0.6141320299357176, + "reward_before_mean": 0.1263586189597845, + "reward_before_std": 0.6014503743499517, + "reward_change_max": 0.0, + "reward_change_mean": -0.2713814973831177, + "reward_change_min": -0.5071048811078072, + "reward_change_std": 0.19316286500543356, + "reward_std": 0.61413205973804, + "rewards/accuracy_reward": 0.18750000186264515, + "rewards/cosine_scaled_reward": -0.06114138173870742, + "step": 247 + }, + { + "clip_fraction": 0.0, + "completion_length": 2029.1250228881836, + "epoch": 0.2834285714285714, + "grad_norm": 0.034633222967386246, + "kl": 0.00014004111289978027, + "lambda_div_used": 0.6094017848372459, + "learning_rate": 6.374054580489873e-07, + "loss": -0.0144, + "reward": 0.2783904932439327, + "reward_after_mean": 0.2783904932439327, + "reward_after_std": 0.6359313689172268, + "reward_before_mean": 0.8294162545353174, + "reward_before_std": 0.5048373020254076, + "reward_change_max": 0.0, + "reward_change_mean": -0.5510257538408041, + "reward_change_min": -0.7822528444230556, + "reward_change_std": 0.3142691068351269, + "reward_std": 0.6359313875436783, + "rewards/accuracy_reward": 0.5416666772216558, + "rewards/cosine_scaled_reward": 0.28774956427514553, + "step": 248 + }, + { + "clip_fraction": 0.0, + "completion_length": 1789.4791870117188, + "epoch": 0.2845714285714286, + "grad_norm": 0.028334610164165497, + "kl": 7.846951484680176e-05, + "lambda_div_used": 0.5886820033192635, + "learning_rate": 6.343215915635761e-07, + "loss": 0.0095, + "reward": -0.009319216012954712, + "reward_after_mean": -0.009319216012954712, + "reward_after_std": 0.5895384289324284, + "reward_before_mean": 0.46247682347893715, + "reward_before_std": 0.4071828918531537, + "reward_change_max": 0.0, + "reward_change_mean": -0.47179603204131126, + "reward_change_min": -0.6552967764437199, + "reward_change_std": 0.2524276301264763, + "reward_std": 0.589538436383009, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/cosine_scaled_reward": 0.1291434899903834, + "step": 249 + }, + { + "clip_fraction": 0.0, + "completion_length": 2342.8750610351562, + "epoch": 0.2857142857142857, + "grad_norm": 0.02926229126751423, + "kl": 0.00020366907119750977, + "lambda_div_used": 0.6367609649896622, + "learning_rate": 6.31233615362752e-07, + "loss": 0.0537, + "reward": 0.04997219145298004, + "reward_after_mean": 0.04997219145298004, + "reward_after_std": 0.6538249664008617, + "reward_before_mean": 0.4029387356713414, + "reward_before_std": 0.6333111096173525, + "reward_change_max": 0.0, + "reward_change_mean": -0.35296651534736156, + "reward_change_min": -0.5975028611719608, + "reward_change_std": 0.23207756876945496, + "reward_std": 0.6538249738514423, + "rewards/accuracy_reward": 0.31250001303851604, + "rewards/cosine_scaled_reward": 0.09043872263282537, + "step": 250 + }, + { + "clip_fraction": 0.0, + "completion_length": 1925.1667289733887, + "epoch": 0.28685714285714287, + "grad_norm": 0.03169158101081848, + "kl": 0.0001310408115386963, + "lambda_div_used": 0.6620426177978516, + "learning_rate": 6.281416799501187e-07, + "loss": -0.0552, + "reward": 0.16181311733089387, + "reward_after_mean": 0.16181311733089387, + "reward_after_std": 0.7404435630887747, + "reward_before_mean": 0.516552684828639, + "reward_before_std": 0.7539111012592912, + "reward_change_max": 0.0, + "reward_change_mean": -0.35473958775401115, + "reward_change_min": -0.6220344565808773, + "reward_change_std": 0.24936181399971247, + "reward_std": 0.7404435705393553, + "rewards/accuracy_reward": 0.39583334885537624, + "rewards/cosine_scaled_reward": 0.12071935646235943, + "step": 251 + }, + { + "clip_fraction": 0.0, + "completion_length": 2385.4166831970215, + "epoch": 0.288, + "grad_norm": 0.027474144473671913, + "kl": 0.00018829107284545898, + "lambda_div_used": 0.5771610513329506, + "learning_rate": 6.25045936022246e-07, + "loss": 0.0396, + "reward": -0.18753607827238739, + "reward_after_mean": -0.18753607827238739, + "reward_after_std": 0.4631412886083126, + "reward_before_mean": 0.1804720275104046, + "reward_before_std": 0.3519942844286561, + "reward_change_max": 0.0, + "reward_change_mean": -0.3680081032216549, + "reward_change_min": -0.5239151008427143, + "reward_change_std": 0.20504287257790565, + "reward_std": 0.46314129047095776, + "rewards/accuracy_reward": 0.22916666977107525, + "rewards/cosine_scaled_reward": -0.04869466880336404, + "step": 252 + }, + { + "clip_fraction": 0.0, + "completion_length": 2590.875045776367, + "epoch": 0.28914285714285715, + "grad_norm": 0.027266530320048332, + "kl": 0.0001958310604095459, + "lambda_div_used": 0.6256092488765717, + "learning_rate": 6.219465344613258e-07, + "loss": -0.0262, + "reward": 0.09082555398344994, + "reward_after_mean": 0.09082555398344994, + "reward_after_std": 0.6417571641504765, + "reward_before_mean": 0.5176093801856041, + "reward_before_std": 0.5764628401957452, + "reward_change_max": 0.0, + "reward_change_mean": -0.4267838429659605, + "reward_change_min": -0.7001185156404972, + "reward_change_std": 0.26451323740184307, + "reward_std": 0.6417571865022182, + "rewards/accuracy_reward": 0.39583334140479565, + "rewards/cosine_scaled_reward": 0.12177603470627218, + "step": 253 + }, + { + "clip_fraction": 0.0, + "completion_length": 2205.3333587646484, + "epoch": 0.29028571428571426, + "grad_norm": 0.03558209538459778, + "kl": 0.0001392364501953125, + "lambda_div_used": 0.6507852524518967, + "learning_rate": 6.188436263278172e-07, + "loss": -0.0987, + "reward": 0.18358214199543, + "reward_after_mean": 0.18358214199543, + "reward_after_std": 0.7428858652710915, + "reward_before_mean": 0.5856727678328753, + "reward_before_std": 0.7034552115947008, + "reward_change_max": 0.0, + "reward_change_mean": -0.40209066309034824, + "reward_change_min": -0.6650605984032154, + "reward_change_std": 0.2646036548539996, + "reward_std": 0.7428858801722527, + "rewards/accuracy_reward": 0.43750000931322575, + "rewards/cosine_scaled_reward": 0.14817279600538313, + "step": 254 + }, + { + "clip_fraction": 0.0, + "completion_length": 3061.5625610351562, + "epoch": 0.2914285714285714, + "grad_norm": 0.026399368420243263, + "kl": 0.00017392635345458984, + "lambda_div_used": 0.6185515820980072, + "learning_rate": 6.157373628530852e-07, + "loss": 0.023, + "reward": -0.10978002939373255, + "reward_after_mean": -0.10978002939373255, + "reward_after_std": 0.5691167917102575, + "reward_before_mean": 0.19686487689614296, + "reward_before_std": 0.5497966632246971, + "reward_change_max": 0.0, + "reward_change_mean": -0.3066448848694563, + "reward_change_min": -0.5226548612117767, + "reward_change_std": 0.20487169921398163, + "reward_std": 0.569116810336709, + "rewards/accuracy_reward": 0.22916667349636555, + "rewards/cosine_scaled_reward": -0.032301797065883875, + "step": 255 + }, + { + "clip_fraction": 0.0, + "completion_length": 2540.5833435058594, + "epoch": 0.2925714285714286, + "grad_norm": 0.02308651991188526, + "kl": 0.00019800662994384766, + "lambda_div_used": 0.6459387838840485, + "learning_rate": 6.126278954320294e-07, + "loss": -0.0509, + "reward": 0.1737559838220477, + "reward_after_mean": 0.1737559838220477, + "reward_after_std": 0.6614836137741804, + "reward_before_mean": 0.5523870065808296, + "reward_before_std": 0.6789623461663723, + "reward_change_max": 0.0, + "reward_change_mean": -0.37863098084926605, + "reward_change_min": -0.6470838598906994, + "reward_change_std": 0.26314268447458744, + "reward_std": 0.661483621224761, + "rewards/accuracy_reward": 0.3958333469927311, + "rewards/cosine_scaled_reward": 0.15655364841222763, + "step": 256 + }, + { + "clip_fraction": 0.0, + "completion_length": 2811.0209045410156, + "epoch": 0.2937142857142857, + "grad_norm": 0.021752549335360527, + "kl": 0.00017321109771728516, + "lambda_div_used": 0.6104201078414917, + "learning_rate": 6.095153756157051e-07, + "loss": 0.077, + "reward": 0.32746705412864685, + "reward_after_mean": 0.32746705412864685, + "reward_after_std": 0.627650348469615, + "reward_before_mean": 0.9031309094280005, + "reward_before_std": 0.5052130986005068, + "reward_change_max": 0.0, + "reward_change_mean": -0.5756638199090958, + "reward_change_min": -0.8629779443144798, + "reward_change_std": 0.33332069404423237, + "reward_std": 0.6276503596454859, + "rewards/accuracy_reward": 0.5625000111758709, + "rewards/cosine_scaled_reward": 0.3406308852136135, + "step": 257 + }, + { + "clip_fraction": 0.0, + "completion_length": 3115.166732788086, + "epoch": 0.2948571428571429, + "grad_norm": 0.019492125138640404, + "kl": 0.00021690130233764648, + "lambda_div_used": 0.6207298263907433, + "learning_rate": 6.06399955103937e-07, + "loss": 0.0613, + "reward": 0.0009925179183483124, + "reward_after_mean": 0.0009925179183483124, + "reward_after_std": 0.5666598528623581, + "reward_before_mean": 0.35771505534648895, + "reward_before_std": 0.5585699509829283, + "reward_change_max": 0.0, + "reward_change_mean": -0.3567225467413664, + "reward_change_min": -0.6022392623126507, + "reward_change_std": 0.23685699328780174, + "reward_std": 0.5666598528623581, + "rewards/accuracy_reward": 0.3125000111758709, + "rewards/cosine_scaled_reward": 0.04521506559103727, + "step": 258 + }, + { + "clip_fraction": 0.0, + "completion_length": 2637.2500534057617, + "epoch": 0.296, + "grad_norm": 0.029653117060661316, + "kl": 0.00020319223403930664, + "lambda_div_used": 0.6043171733617783, + "learning_rate": 6.032817857379256e-07, + "loss": -0.0019, + "reward": -0.037313513457775116, + "reward_after_mean": -0.037313513457775116, + "reward_after_std": 0.5161938592791557, + "reward_before_mean": 0.3289037337526679, + "reward_before_std": 0.47458031587302685, + "reward_change_max": 0.0, + "reward_change_mean": -0.3662172295153141, + "reward_change_min": -0.5700537078082561, + "reward_change_std": 0.2212026845663786, + "reward_std": 0.5161938853561878, + "rewards/accuracy_reward": 0.33333334140479565, + "rewards/cosine_scaled_reward": -0.004429628141224384, + "step": 259 + }, + { + "clip_fraction": 0.0, + "completion_length": 1935.9166679382324, + "epoch": 0.29714285714285715, + "grad_norm": 0.028990233317017555, + "kl": 0.00012747198343276978, + "lambda_div_used": 0.56998710334301, + "learning_rate": 6.001610194928464e-07, + "loss": -0.0104, + "reward": 0.24466626904904842, + "reward_after_mean": 0.24466626904904842, + "reward_after_std": 0.5734536852687597, + "reward_before_mean": 0.9249862097203732, + "reward_before_std": 0.32228614180348814, + "reward_change_max": 0.0, + "reward_change_mean": -0.6803199425339699, + "reward_change_min": -0.9398231357336044, + "reward_change_std": 0.3666897714138031, + "reward_std": 0.5734537076205015, + "rewards/accuracy_reward": 0.6041666679084301, + "rewards/cosine_scaled_reward": 0.3208195334300399, + "step": 260 + }, + { + "clip_fraction": 0.0, + "completion_length": 2764.8958892822266, + "epoch": 0.29828571428571427, + "grad_norm": 0.021772203966975212, + "kl": 0.00015366077423095703, + "lambda_div_used": 0.6116980388760567, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0391, + "reward": -0.14533425867557526, + "reward_after_mean": -0.14533425867557526, + "reward_after_std": 0.5442243628203869, + "reward_before_mean": 0.15450917836278677, + "reward_before_std": 0.5254541491158307, + "reward_change_max": 0.0, + "reward_change_mean": -0.29984344728291035, + "reward_change_min": -0.5264418311417103, + "reward_change_std": 0.2055408162996173, + "reward_std": 0.5442243684083223, + "rewards/accuracy_reward": 0.1875000037252903, + "rewards/cosine_scaled_reward": -0.03299082722514868, + "step": 261 + }, + { + "clip_fraction": 0.0, + "completion_length": 2946.541732788086, + "epoch": 0.29942857142857143, + "grad_norm": 0.02444116398692131, + "kl": 0.00019305944442749023, + "lambda_div_used": 0.5770210847258568, + "learning_rate": 5.939123048916173e-07, + "loss": -0.0078, + "reward": -0.30842714570462704, + "reward_after_mean": -0.30842714570462704, + "reward_after_std": 0.4061661623418331, + "reward_before_mean": -0.014746684581041336, + "reward_before_std": 0.35232585947960615, + "reward_change_max": 0.0, + "reward_change_mean": -0.29368047416210175, + "reward_change_min": -0.43233491107821465, + "reward_change_std": 0.167787273414433, + "reward_std": 0.40616616792976856, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/cosine_scaled_reward": -0.11891335435211658, + "step": 262 + }, + { + "clip_fraction": 0.0, + "completion_length": 2761.7083740234375, + "epoch": 0.30057142857142854, + "grad_norm": 0.026775242760777473, + "kl": 0.00015038251876831055, + "lambda_div_used": 0.5933946445584297, + "learning_rate": 5.907846610890011e-07, + "loss": -0.0212, + "reward": -0.28860565181821585, + "reward_after_mean": -0.28860565181821585, + "reward_after_std": 0.4898714739829302, + "reward_before_mean": -0.010391712188720703, + "reward_before_std": 0.4291188698261976, + "reward_change_max": 0.0, + "reward_change_mean": -0.27821394614875317, + "reward_change_min": -0.43029162287712097, + "reward_change_std": 0.16667384281754494, + "reward_std": 0.48987148329615593, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/cosine_scaled_reward": -0.11455838289111853, + "step": 263 + }, + { + "clip_fraction": 0.0, + "completion_length": 2586.3750534057617, + "epoch": 0.3017142857142857, + "grad_norm": 0.01994405686855316, + "kl": 0.0001627206802368164, + "lambda_div_used": 0.6219401434063911, + "learning_rate": 5.87655029499542e-07, + "loss": 0.043, + "reward": -0.11865252908319235, + "reward_after_mean": -0.11865252908319235, + "reward_after_std": 0.59744056686759, + "reward_before_mean": 0.1666876282542944, + "reward_before_std": 0.5631906799972057, + "reward_change_max": 0.0, + "reward_change_mean": -0.2853401657193899, + "reward_change_min": -0.4587775580585003, + "reward_change_std": 0.18016593530774117, + "reward_std": 0.597440579906106, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/cosine_scaled_reward": -0.04164570523425937, + "step": 264 + }, + { + "clip_fraction": 0.0, + "completion_length": 1858.2083587646484, + "epoch": 0.3028571428571429, + "grad_norm": 0.02953009493649006, + "kl": 0.0001354515552520752, + "lambda_div_used": 0.5754420235753059, + "learning_rate": 5.845235626570683e-07, + "loss": 0.1211, + "reward": 0.017749376595020294, + "reward_after_mean": 0.017749376595020294, + "reward_after_std": 0.478146318346262, + "reward_before_mean": 0.4997409600764513, + "reward_before_std": 0.3383461497724056, + "reward_change_max": 0.0, + "reward_change_mean": -0.48199158161878586, + "reward_change_min": -0.6475610621273518, + "reward_change_std": 0.2544400207698345, + "reward_std": 0.4781463425606489, + "rewards/accuracy_reward": 0.37500000558793545, + "rewards/cosine_scaled_reward": 0.12474094179924577, + "step": 265 + }, + { + "clip_fraction": 0.0, + "completion_length": 3198.625, + "epoch": 0.304, + "grad_norm": 0.019427413120865822, + "kl": 0.00019751489162445068, + "lambda_div_used": 0.5534809529781342, + "learning_rate": 5.813904131848564e-07, + "loss": -0.0091, + "reward": -0.40436042100191116, + "reward_after_mean": -0.40436042100191116, + "reward_after_std": 0.2967198472470045, + "reward_before_mean": -0.12292576022446156, + "reward_before_std": 0.23972244351170957, + "reward_change_max": 0.0, + "reward_change_mean": -0.2814346421509981, + "reward_change_min": -0.43088357895612717, + "reward_change_std": 0.16106584202498198, + "reward_std": 0.29671985376626253, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/cosine_scaled_reward": -0.22709244303405285, + "step": 266 + }, + { + "clip_fraction": 0.0, + "completion_length": 2879.354202270508, + "epoch": 0.30514285714285716, + "grad_norm": 0.02146352268755436, + "kl": 0.00019466876983642578, + "lambda_div_used": 0.6196897253394127, + "learning_rate": 5.78255733788191e-07, + "loss": -0.0062, + "reward": -0.12080054543912411, + "reward_after_mean": -0.12080054543912411, + "reward_after_std": 0.5834086053073406, + "reward_before_mean": 0.1734671276062727, + "reward_before_std": 0.5533927101641893, + "reward_change_max": 0.0, + "reward_change_mean": -0.29426765628159046, + "reward_change_min": -0.4760690741240978, + "reward_change_std": 0.18818860314786434, + "reward_std": 0.5834086183458567, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/cosine_scaled_reward": -0.03486621752381325, + "step": 267 + }, + { + "clip_fraction": 0.0, + "completion_length": 2556.5208740234375, + "epoch": 0.3062857142857143, + "grad_norm": 0.03222980722784996, + "kl": 0.00019940733909606934, + "lambda_div_used": 0.6217963546514511, + "learning_rate": 5.751196772469237e-07, + "loss": 0.11, + "reward": -0.0901529286056757, + "reward_after_mean": -0.0901529286056757, + "reward_after_std": 0.5890399143099785, + "reward_before_mean": 0.22261973470449448, + "reward_before_std": 0.5651240181177855, + "reward_change_max": 0.0, + "reward_change_mean": -0.3127726651728153, + "reward_change_min": -0.5852204114198685, + "reward_change_std": 0.21594477724283934, + "reward_std": 0.5890399310737848, + "rewards/accuracy_reward": 0.2708333395421505, + "rewards/cosine_scaled_reward": -0.04821361042559147, + "step": 268 + }, + { + "clip_fraction": 0.0, + "completion_length": 2763.0000534057617, + "epoch": 0.30742857142857144, + "grad_norm": 0.023989371955394745, + "kl": 0.00017099082469940186, + "lambda_div_used": 0.5594866573810577, + "learning_rate": 5.71982396408026e-07, + "loss": 0.04, + "reward": 0.10778852179646492, + "reward_after_mean": 0.10778852179646492, + "reward_after_std": 0.45204984955489635, + "reward_before_mean": 0.7018298227339983, + "reward_before_std": 0.26720918249338865, + "reward_change_max": 0.0, + "reward_change_mean": -0.5940412897616625, + "reward_change_min": -0.8074228167533875, + "reward_change_std": 0.31475239619612694, + "reward_std": 0.4520498663187027, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/cosine_scaled_reward": 0.22266313433647156, + "step": 269 + }, + { + "clip_fraction": 0.0, + "completion_length": 2451.7917404174805, + "epoch": 0.30857142857142855, + "grad_norm": 0.02231750823557377, + "kl": 0.00018018484115600586, + "lambda_div_used": 0.6626150384545326, + "learning_rate": 5.688440441781398e-07, + "loss": 0.0293, + "reward": 0.23073547426611185, + "reward_after_mean": 0.23073547426611185, + "reward_after_std": 0.8142610676586628, + "reward_before_mean": 0.6419318169355392, + "reward_before_std": 0.7594864275306463, + "reward_change_max": 0.0, + "reward_change_mean": -0.4111963789910078, + "reward_change_min": -0.6618837527930737, + "reward_change_std": 0.26606686785817146, + "reward_std": 0.8142610862851143, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/cosine_scaled_reward": 0.18359847948886454, + "step": 270 + }, + { + "clip_fraction": 0.0, + "completion_length": 1821.8333740234375, + "epoch": 0.3097142857142857, + "grad_norm": 0.029715267941355705, + "kl": 0.00010024569928646088, + "lambda_div_used": 0.6296036839485168, + "learning_rate": 5.657047735161255e-07, + "loss": -0.127, + "reward": 0.15136122331023216, + "reward_after_mean": 0.15136122331023216, + "reward_after_std": 0.6207431796938181, + "reward_before_mean": 0.5634245574474335, + "reward_before_std": 0.6018376401625574, + "reward_change_max": 0.0, + "reward_change_mean": -0.4120633378624916, + "reward_change_min": -0.6677181459963322, + "reward_change_std": 0.26569564640522003, + "reward_std": 0.6207432132214308, + "rewards/accuracy_reward": 0.4375000111758709, + "rewards/cosine_scaled_reward": 0.1259245565161109, + "step": 271 + }, + { + "clip_fraction": 0.0, + "completion_length": 2659.0834045410156, + "epoch": 0.31085714285714283, + "grad_norm": 0.025181055068969727, + "kl": 0.0002090930938720703, + "lambda_div_used": 0.6270301192998886, + "learning_rate": 5.625647374256061e-07, + "loss": 0.0218, + "reward": 0.11005037371069193, + "reward_after_mean": 0.11005037371069193, + "reward_after_std": 0.6581083033233881, + "reward_before_mean": 0.5188289349898696, + "reward_before_std": 0.59244554489851, + "reward_change_max": 0.0, + "reward_change_mean": -0.40877855755388737, + "reward_change_min": -0.6175772212445736, + "reward_change_std": 0.2535307565703988, + "reward_std": 0.6581083126366138, + "rewards/accuracy_reward": 0.39583334140479565, + "rewards/cosine_scaled_reward": 0.12299557868391275, + "step": 272 + }, + { + "clip_fraction": 0.0, + "completion_length": 2516.875045776367, + "epoch": 0.312, + "grad_norm": 0.022818049415946007, + "kl": 0.00016960501670837402, + "lambda_div_used": 0.5807301178574562, + "learning_rate": 5.594240889475106e-07, + "loss": -0.0175, + "reward": 0.09495561942458153, + "reward_after_mean": 0.09495561942458153, + "reward_after_std": 0.5033265259116888, + "reward_before_mean": 0.6270047463476658, + "reward_before_std": 0.3717161314561963, + "reward_change_max": 0.0, + "reward_change_mean": -0.5320491325110197, + "reward_change_min": -0.7614771388471127, + "reward_change_std": 0.30045478232204914, + "reward_std": 0.5033265501260757, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/cosine_scaled_reward": 0.2103380784392357, + "step": 273 + }, + { + "clip_fraction": 0.0, + "completion_length": 1573.3750228881836, + "epoch": 0.31314285714285717, + "grad_norm": 0.03588717430830002, + "kl": 0.00010608136653900146, + "lambda_div_used": 0.6433197036385536, + "learning_rate": 5.562829811526154e-07, + "loss": -0.0021, + "reward": 0.24507278576493263, + "reward_after_mean": 0.24507278576493263, + "reward_after_std": 0.7453816495835781, + "reward_before_mean": 0.6975303117651492, + "reward_before_std": 0.6636776090599597, + "reward_change_max": 0.0, + "reward_change_mean": -0.4524575434625149, + "reward_change_min": -0.703257791697979, + "reward_change_std": 0.2789953136816621, + "reward_std": 0.7453816495835781, + "rewards/accuracy_reward": 0.479166679084301, + "rewards/cosine_scaled_reward": 0.21836365200579166, + "step": 274 + }, + { + "clip_fraction": 0.0, + "completion_length": 2231.375015258789, + "epoch": 0.3142857142857143, + "grad_norm": 0.02452162466943264, + "kl": 0.00013327598571777344, + "lambda_div_used": 0.5854036509990692, + "learning_rate": 5.531415671340826e-07, + "loss": 0.0351, + "reward": 0.1521737277507782, + "reward_after_mean": 0.1521737277507782, + "reward_after_std": 0.5359849948436022, + "reward_before_mean": 0.6887877276167274, + "reward_before_std": 0.3889514375478029, + "reward_change_max": 0.0, + "reward_change_mean": -0.5366139896214008, + "reward_change_min": -0.7292314879596233, + "reward_change_std": 0.28961729165166616, + "reward_std": 0.5359850041568279, + "rewards/accuracy_reward": 0.47916667722165585, + "rewards/cosine_scaled_reward": 0.20962106250226498, + "step": 275 + }, + { + "clip_fraction": 0.0, + "completion_length": 2479.375015258789, + "epoch": 0.31542857142857145, + "grad_norm": 0.022260351106524467, + "kl": 0.0001841336488723755, + "lambda_div_used": 0.6080946400761604, + "learning_rate": 5.5e-07, + "loss": 0.0233, + "reward": 0.15685568936169147, + "reward_after_mean": 0.15685568936169147, + "reward_after_std": 0.6163474209606647, + "reward_before_mean": 0.6554346140474081, + "reward_before_std": 0.5032088747248054, + "reward_change_max": 0.0, + "reward_change_mean": -0.49857890233397484, + "reward_change_min": -0.7599121108651161, + "reward_change_std": 0.2981911161914468, + "reward_std": 0.6163474582135677, + "rewards/accuracy_reward": 0.4583333395421505, + "rewards/cosine_scaled_reward": 0.19710125587880611, + "step": 276 + }, + { + "clip_fraction": 0.0, + "completion_length": 2441.625045776367, + "epoch": 0.31657142857142856, + "grad_norm": 0.0253590177744627, + "kl": 0.0002009868621826172, + "lambda_div_used": 0.662374809384346, + "learning_rate": 5.468584328659172e-07, + "loss": 0.0379, + "reward": 0.13848626799881458, + "reward_after_mean": 0.13848626799881458, + "reward_after_std": 0.8288281839340925, + "reward_before_mean": 0.5136084349360317, + "reward_before_std": 0.7587530063465238, + "reward_change_max": 0.0, + "reward_change_mean": -0.37512217462062836, + "reward_change_min": -0.6195148192346096, + "reward_change_std": 0.24048541858792305, + "reward_std": 0.8288282137364149, + "rewards/accuracy_reward": 0.37500000558793545, + "rewards/cosine_scaled_reward": 0.13860841654241085, + "step": 277 + }, + { + "clip_fraction": 0.0, + "completion_length": 2101.729202270508, + "epoch": 0.3177142857142857, + "grad_norm": 0.03273219242691994, + "kl": 0.0001831650733947754, + "lambda_div_used": 0.5535080209374428, + "learning_rate": 5.437170188473847e-07, + "loss": 0.0176, + "reward": 0.026296844705939293, + "reward_after_mean": 0.026296844705939293, + "reward_after_std": 0.47010152228176594, + "reward_before_mean": 0.6100956231821328, + "reward_before_std": 0.24391112057492137, + "reward_change_max": 0.0, + "reward_change_mean": -0.5837987624108791, + "reward_change_min": -0.8103098906576633, + "reward_change_std": 0.3077183160930872, + "reward_std": 0.4701015278697014, + "rewards/accuracy_reward": 0.4166666679084301, + "rewards/cosine_scaled_reward": 0.19342893542489037, + "step": 278 + }, + { + "clip_fraction": 0.0, + "completion_length": 3132.5416717529297, + "epoch": 0.31885714285714284, + "grad_norm": 0.021023932844400406, + "kl": 0.000225067138671875, + "lambda_div_used": 0.6101865917444229, + "learning_rate": 5.405759110524894e-07, + "loss": 0.0252, + "reward": -0.24995685555040836, + "reward_after_mean": -0.24995685555040836, + "reward_after_std": 0.5465981848537922, + "reward_before_mean": 0.00569869764149189, + "reward_before_std": 0.5138198006898165, + "reward_change_max": 0.0, + "reward_change_mean": -0.25565553829073906, + "reward_change_min": -0.42565521970391273, + "reward_change_std": 0.1681571202352643, + "reward_std": 0.5465981848537922, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/cosine_scaled_reward": -0.14013465493917465, + "step": 279 + }, + { + "clip_fraction": 0.0, + "completion_length": 2124.208381652832, + "epoch": 0.32, + "grad_norm": 0.03544396162033081, + "kl": 0.000164031982421875, + "lambda_div_used": 0.6441325098276138, + "learning_rate": 5.37435262574394e-07, + "loss": 0.0491, + "reward": 0.1208603996783495, + "reward_after_mean": 0.1208603996783495, + "reward_after_std": 0.6665525771677494, + "reward_before_mean": 0.48556846380233765, + "reward_before_std": 0.6708142012357712, + "reward_change_max": 0.0, + "reward_change_mean": -0.364708062261343, + "reward_change_min": -0.6562114134430885, + "reward_change_std": 0.25303495209664106, + "reward_std": 0.66655258461833, + "rewards/accuracy_reward": 0.354166679084301, + "rewards/cosine_scaled_reward": 0.13140178471803665, + "step": 280 + }, + { + "clip_fraction": 0.0, + "completion_length": 3433.6459045410156, + "epoch": 0.3211428571428571, + "grad_norm": 0.0178577471524477, + "kl": 0.00023686885833740234, + "lambda_div_used": 0.5564139187335968, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0092, + "reward": -0.4361804537475109, + "reward_after_mean": -0.4361804537475109, + "reward_after_std": 0.3096983712166548, + "reward_before_mean": -0.1626793835312128, + "reward_before_std": 0.2517517115920782, + "reward_change_max": 0.0, + "reward_change_mean": -0.2735010664910078, + "reward_change_min": -0.44846677780151367, + "reward_change_std": 0.1593692358583212, + "reward_std": 0.3096983730792999, + "rewards/accuracy_reward": 0.0625, + "rewards/cosine_scaled_reward": -0.22517940029501915, + "step": 281 + }, + { + "clip_fraction": 0.0, + "completion_length": 2314.645866394043, + "epoch": 0.3222857142857143, + "grad_norm": 0.023510051891207695, + "kl": 0.00020015239715576172, + "lambda_div_used": 0.5891857892274857, + "learning_rate": 5.311559558218603e-07, + "loss": -0.0533, + "reward": 0.015196382999420166, + "reward_after_mean": 0.015196382999420166, + "reward_after_std": 0.506258824840188, + "reward_before_mean": 0.4649945506826043, + "reward_before_std": 0.40477199107408524, + "reward_change_max": 0.0, + "reward_change_mean": -0.449798122048378, + "reward_change_min": -0.6414072066545486, + "reward_change_std": 0.2532971305772662, + "reward_std": 0.5062588378787041, + "rewards/accuracy_reward": 0.35416667722165585, + "rewards/cosine_scaled_reward": 0.11082786321640015, + "step": 282 + }, + { + "clip_fraction": 0.0, + "completion_length": 2393.854232788086, + "epoch": 0.32342857142857145, + "grad_norm": 0.02048996463418007, + "kl": 0.00015240907669067383, + "lambda_div_used": 0.587490864098072, + "learning_rate": 5.28017603591974e-07, + "loss": 0.0139, + "reward": 0.30065850354731083, + "reward_after_mean": 0.30065850354731083, + "reward_after_std": 0.5727098472416401, + "reward_before_mean": 0.9280985994264483, + "reward_before_std": 0.3965794490650296, + "reward_change_max": 0.0, + "reward_change_mean": -0.6274400968104601, + "reward_change_min": -0.8531196974217892, + "reward_change_std": 0.3378349719569087, + "reward_std": 0.5727098621428013, + "rewards/accuracy_reward": 0.6041666772216558, + "rewards/cosine_scaled_reward": 0.3239319231361151, + "step": 283 + }, + { + "clip_fraction": 0.0, + "completion_length": 2126.020851135254, + "epoch": 0.32457142857142857, + "grad_norm": 0.028623223304748535, + "kl": 0.00013819336891174316, + "lambda_div_used": 0.6661800816655159, + "learning_rate": 5.248803227530763e-07, + "loss": 0.0092, + "reward": 0.09131545946002007, + "reward_after_mean": 0.09131545946002007, + "reward_after_std": 0.7503824215382338, + "reward_before_mean": 0.4026043973863125, + "reward_before_std": 0.788893286138773, + "reward_change_max": 0.0, + "reward_change_mean": -0.3112889491021633, + "reward_change_min": -0.6233196444809437, + "reward_change_std": 0.24689115211367607, + "reward_std": 0.7503824215382338, + "rewards/accuracy_reward": 0.3125000037252903, + "rewards/cosine_scaled_reward": 0.09010439366102219, + "step": 284 + }, + { + "clip_fraction": 0.0, + "completion_length": 1871.5000228881836, + "epoch": 0.32571428571428573, + "grad_norm": 0.02662411518394947, + "kl": 0.00016036629676818848, + "lambda_div_used": 0.576582707464695, + "learning_rate": 5.21744266211809e-07, + "loss": 0.0319, + "reward": -0.3101608529686928, + "reward_after_mean": -0.3101608529686928, + "reward_after_std": 0.41523571871221066, + "reward_before_mean": -0.01919533498585224, + "reward_before_std": 0.3510099109262228, + "reward_change_max": 0.0, + "reward_change_mean": -0.2909655049443245, + "reward_change_min": -0.4455920048058033, + "reward_change_std": 0.16767465602606535, + "reward_std": 0.4152357243001461, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/cosine_scaled_reward": -0.12336201290600002, + "step": 285 + }, + { + "clip_fraction": 0.0, + "completion_length": 2349.000030517578, + "epoch": 0.32685714285714285, + "grad_norm": 0.02408268116414547, + "kl": 0.00019100308418273926, + "lambda_div_used": 0.6201390102505684, + "learning_rate": 5.186095868151436e-07, + "loss": 0.0307, + "reward": 0.21592768095433712, + "reward_after_mean": 0.21592768095433712, + "reward_after_std": 0.66009739972651, + "reward_before_mean": 0.7218069694936275, + "reward_before_std": 0.554044695571065, + "reward_change_max": 0.0, + "reward_change_mean": -0.5058793053030968, + "reward_change_min": -0.7697652019560337, + "reward_change_std": 0.2988813826814294, + "reward_std": 0.6600974258035421, + "rewards/accuracy_reward": 0.5000000093132257, + "rewards/cosine_scaled_reward": 0.22180695831775665, + "step": 286 + }, + { + "clip_fraction": 0.0, + "completion_length": 1718.604175567627, + "epoch": 0.328, + "grad_norm": 0.03919753059744835, + "kl": 0.00013002753257751465, + "lambda_div_used": 0.5975622236728668, + "learning_rate": 5.154764373429315e-07, + "loss": -0.1056, + "reward": 0.03168256084609311, + "reward_after_mean": 0.03168256084609311, + "reward_after_std": 0.5345403701066971, + "reward_before_mean": 0.4581381119787693, + "reward_before_std": 0.44751388020813465, + "reward_change_max": 0.0, + "reward_change_mean": -0.42645558528602123, + "reward_change_min": -0.617030244320631, + "reward_change_std": 0.24450463335961103, + "reward_std": 0.5345403775572777, + "rewards/accuracy_reward": 0.33333334140479565, + "rewards/cosine_scaled_reward": 0.12480480223894119, + "step": 287 + }, + { + "clip_fraction": 0.0, + "completion_length": 2906.3541717529297, + "epoch": 0.3291428571428571, + "grad_norm": 0.019498826935887337, + "kl": 0.00022274255752563477, + "lambda_div_used": 0.5806883201003075, + "learning_rate": 5.123449705004581e-07, + "loss": 0.0055, + "reward": -0.18337237276136875, + "reward_after_mean": -0.18337237276136875, + "reward_after_std": 0.4604283105581999, + "reward_before_mean": 0.1833638995885849, + "reward_before_std": 0.3699948964640498, + "reward_change_max": 0.0, + "reward_change_mean": -0.36673627234995365, + "reward_change_min": -0.5305228792130947, + "reward_change_std": 0.20658569782972336, + "reward_std": 0.4604283291846514, + "rewards/accuracy_reward": 0.22916666977107525, + "rewards/cosine_scaled_reward": -0.04580276645720005, + "step": 288 + }, + { + "clip_fraction": 0.0, + "completion_length": 2202.666702270508, + "epoch": 0.3302857142857143, + "grad_norm": 0.033871494233608246, + "kl": 0.00016301870346069336, + "lambda_div_used": 0.5840093046426773, + "learning_rate": 5.09215338910999e-07, + "loss": -0.0478, + "reward": 0.007814206182956696, + "reward_after_mean": 0.007814206182956696, + "reward_after_std": 0.4950754214078188, + "reward_before_mean": 0.46548917703330517, + "reward_before_std": 0.381549178622663, + "reward_change_max": 0.0, + "reward_change_mean": -0.45767495781183243, + "reward_change_min": -0.6326170898973942, + "reward_change_std": 0.24920715391635895, + "reward_std": 0.49507543072104454, + "rewards/accuracy_reward": 0.37500000558793545, + "rewards/cosine_scaled_reward": 0.09048915281891823, + "step": 289 + }, + { + "clip_fraction": 0.0, + "completion_length": 1533.6042251586914, + "epoch": 0.3314285714285714, + "grad_norm": 0.032007429748773575, + "kl": 0.00014695525169372559, + "lambda_div_used": 0.6211200878024101, + "learning_rate": 5.060876951083828e-07, + "loss": 0.0764, + "reward": 0.04068056936375797, + "reward_after_mean": 0.04068056936375797, + "reward_after_std": 0.6179232522845268, + "reward_before_mean": 0.4251148612238467, + "reward_before_std": 0.556100070476532, + "reward_change_max": 0.0, + "reward_change_mean": -0.3844342865049839, + "reward_change_min": -0.5800922103226185, + "reward_change_std": 0.22699587792158127, + "reward_std": 0.617923267185688, + "rewards/accuracy_reward": 0.3333333395421505, + "rewards/cosine_scaled_reward": 0.09178150352090597, + "step": 290 + }, + { + "clip_fraction": 0.0, + "completion_length": 1996.187515258789, + "epoch": 0.3325714285714286, + "grad_norm": 0.024394486099481583, + "kl": 0.00016900897026062012, + "lambda_div_used": 0.67839565128088, + "learning_rate": 5.02962191529556e-07, + "loss": 0.0077, + "reward": 0.09233464859426022, + "reward_after_mean": 0.09233464859426022, + "reward_after_std": 0.8132282309234142, + "reward_before_mean": 0.3814069051295519, + "reward_before_std": 0.8372865226119757, + "reward_change_max": 0.0, + "reward_change_mean": -0.2890722490847111, + "reward_change_min": -0.5995339304208755, + "reward_change_std": 0.22939695976674557, + "reward_std": 0.8132282607257366, + "rewards/accuracy_reward": 0.31250000558793545, + "rewards/cosine_scaled_reward": 0.06890689395368099, + "step": 291 + }, + { + "clip_fraction": 0.0, + "completion_length": 2611.541702270508, + "epoch": 0.33371428571428574, + "grad_norm": 0.022669553756713867, + "kl": 0.00019761919975280762, + "lambda_div_used": 0.6018925532698631, + "learning_rate": 4.998389805071536e-07, + "loss": 0.0303, + "reward": -0.11102894321084023, + "reward_after_mean": -0.11102894321084023, + "reward_after_std": 0.5624180883169174, + "reward_before_mean": 0.24086102936416864, + "reward_before_std": 0.4718956621363759, + "reward_change_max": 0.0, + "reward_change_mean": -0.35188993997871876, + "reward_change_min": -0.5550865493714809, + "reward_change_std": 0.21156923100352287, + "reward_std": 0.5624181143939495, + "rewards/accuracy_reward": 0.27083333767950535, + "rewards/cosine_scaled_reward": -0.029972338117659092, + "step": 292 + }, + { + "clip_fraction": 0.0, + "completion_length": 2105.812545776367, + "epoch": 0.33485714285714285, + "grad_norm": 0.022291820496320724, + "kl": 0.00020776689052581787, + "lambda_div_used": 0.5899104326963425, + "learning_rate": 4.967182142620745e-07, + "loss": -0.0174, + "reward": -0.15591239370405674, + "reward_after_mean": -0.15591239370405674, + "reward_after_std": 0.4546964541077614, + "reward_before_mean": 0.18034806847572327, + "reward_before_std": 0.4065048359334469, + "reward_change_max": 0.0, + "reward_change_mean": -0.33626046776771545, + "reward_change_min": -0.5091775916516781, + "reward_change_std": 0.19646561425179243, + "reward_std": 0.4546964690089226, + "rewards/accuracy_reward": 0.22916667722165585, + "rewards/cosine_scaled_reward": -0.048818591982126236, + "step": 293 + }, + { + "clip_fraction": 0.0, + "completion_length": 3101.562545776367, + "epoch": 0.336, + "grad_norm": 0.020145803689956665, + "kl": 0.0002519190311431885, + "lambda_div_used": 0.5795014202594757, + "learning_rate": 4.93600044896063e-07, + "loss": -0.0181, + "reward": -0.29703800566494465, + "reward_after_mean": -0.29703800566494465, + "reward_after_std": 0.4137891363352537, + "reward_before_mean": -0.004143683239817619, + "reward_before_std": 0.3665550462901592, + "reward_change_max": 0.0, + "reward_change_mean": -0.2928943391889334, + "reward_change_min": -0.4820845164358616, + "reward_change_std": 0.18245024606585503, + "reward_std": 0.4137891549617052, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/cosine_scaled_reward": -0.1291436767205596, + "step": 294 + }, + { + "clip_fraction": 0.0, + "completion_length": 2967.1250076293945, + "epoch": 0.33714285714285713, + "grad_norm": 0.03346420079469681, + "kl": 0.00022870302200317383, + "lambda_div_used": 0.5998342111706734, + "learning_rate": 4.904846243842949e-07, + "loss": 0.0317, + "reward": -0.21130692400038242, + "reward_after_mean": -0.21130692400038242, + "reward_after_std": 0.4944173116236925, + "reward_before_mean": 0.07879196340218186, + "reward_before_std": 0.4600360617041588, + "reward_change_max": 0.0, + "reward_change_mean": -0.2900988757610321, + "reward_change_min": -0.47371478378772736, + "reward_change_std": 0.18294072337448597, + "reward_std": 0.49441731721162796, + "rewards/accuracy_reward": 0.16666666977107525, + "rewards/cosine_scaled_reward": -0.08787472359836102, + "step": 295 + }, + { + "clip_fraction": 0.0, + "completion_length": 2934.3541870117188, + "epoch": 0.3382857142857143, + "grad_norm": 0.024312211200594902, + "kl": 0.00023806095123291016, + "lambda_div_used": 0.5608572289347649, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0627, + "reward": -0.18713558092713356, + "reward_after_mean": -0.18713558092713356, + "reward_after_std": 0.37113036401569843, + "reward_before_mean": 0.2144376989454031, + "reward_before_std": 0.2741664042696357, + "reward_change_max": 0.0, + "reward_change_mean": -0.40157328359782696, + "reward_change_min": -0.5742517001926899, + "reward_change_std": 0.22423129715025425, + "reward_std": 0.3711303863674402, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/cosine_scaled_reward": -0.01472897082567215, + "step": 296 + }, + { + "clip_fraction": 0.0, + "completion_length": 3350.8958435058594, + "epoch": 0.3394285714285714, + "grad_norm": 0.016911419108510017, + "kl": 0.0002732276916503906, + "lambda_div_used": 0.6085675731301308, + "learning_rate": 4.842626371469149e-07, + "loss": -0.0001, + "reward": -0.13465989474207163, + "reward_after_mean": -0.13465989474207163, + "reward_after_std": 0.5554658677428961, + "reward_before_mean": 0.1848100395873189, + "reward_before_std": 0.494691526517272, + "reward_change_max": 0.0, + "reward_change_mean": -0.31946992687880993, + "reward_change_min": -0.5050319209694862, + "reward_change_std": 0.18696323037147522, + "reward_std": 0.5554658826440573, + "rewards/accuracy_reward": 0.2291666679084301, + "rewards/cosine_scaled_reward": -0.04435660713352263, + "step": 297 + }, + { + "clip_fraction": 0.0, + "completion_length": 2694.791748046875, + "epoch": 0.3405714285714286, + "grad_norm": 0.022230952978134155, + "kl": 0.0001736283302307129, + "lambda_div_used": 0.585621178150177, + "learning_rate": 4.811563736721829e-07, + "loss": -0.0641, + "reward": -0.09131219866685569, + "reward_after_mean": -0.09131219866685569, + "reward_after_std": 0.521870668977499, + "reward_before_mean": 0.32360453344881535, + "reward_before_std": 0.3913488043472171, + "reward_change_max": 0.0, + "reward_change_mean": -0.41491674818098545, + "reward_change_min": -0.6092600487172604, + "reward_change_std": 0.22935225442051888, + "reward_std": 0.5218706801533699, + "rewards/accuracy_reward": 0.31250000186264515, + "rewards/cosine_scaled_reward": 0.011104530887678266, + "step": 298 + }, + { + "clip_fraction": 0.0, + "completion_length": 3055.562530517578, + "epoch": 0.3417142857142857, + "grad_norm": 0.016956690698862076, + "kl": 0.00024694204330444336, + "lambda_div_used": 0.6225644424557686, + "learning_rate": 4.780534655386743e-07, + "loss": -0.0092, + "reward": 0.035534653812646866, + "reward_after_mean": 0.035534653812646866, + "reward_after_std": 0.5773687828332186, + "reward_before_mean": 0.4040503818541765, + "reward_before_std": 0.5686829779297113, + "reward_change_max": 0.0, + "reward_change_mean": -0.3685157597064972, + "reward_change_min": -0.6160638965666294, + "reward_change_std": 0.24328476376831532, + "reward_std": 0.5773687846958637, + "rewards/accuracy_reward": 0.3125000111758709, + "rewards/cosine_scaled_reward": 0.09155038185417652, + "step": 299 + }, + { + "clip_fraction": 0.0, + "completion_length": 3382.041717529297, + "epoch": 0.34285714285714286, + "grad_norm": 0.018982913345098495, + "kl": 0.00028121471405029297, + "lambda_div_used": 0.5635487735271454, + "learning_rate": 4.749540639777539e-07, + "loss": 0.0416, + "reward": -0.35363302007317543, + "reward_after_mean": -0.35363302007317543, + "reward_after_std": 0.3348991144448519, + "reward_before_mean": -0.05836603417992592, + "reward_before_std": 0.2893520062789321, + "reward_change_max": 0.0, + "reward_change_mean": -0.2952669896185398, + "reward_change_min": -0.45902693271636963, + "reward_change_std": 0.17492949962615967, + "reward_std": 0.33489912562072277, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/cosine_scaled_reward": -0.14169937558472157, + "step": 300 + }, + { + "clip_fraction": 0.0, + "completion_length": 2189.666732788086, + "epoch": 0.344, + "grad_norm": 0.02743070013821125, + "kl": 0.00020772218704223633, + "lambda_div_used": 0.6162105649709702, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.077, + "reward": -0.2270398661494255, + "reward_after_mean": -0.2270398661494255, + "reward_after_std": 0.5807012170553207, + "reward_before_mean": 0.03607312589883804, + "reward_before_std": 0.5330064725130796, + "reward_change_max": 0.0, + "reward_change_mean": -0.263112997636199, + "reward_change_min": -0.46500347927212715, + "reward_change_std": 0.16810790356248617, + "reward_std": 0.5807012394070625, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/cosine_scaled_reward": -0.10976020619273186, + "step": 301 + }, + { + "clip_fraction": 0.0, + "completion_length": 2271.9167098999023, + "epoch": 0.34514285714285714, + "grad_norm": 0.03009362705051899, + "kl": 0.00019848346710205078, + "lambda_div_used": 0.6102809309959412, + "learning_rate": 4.68766384637248e-07, + "loss": -0.0061, + "reward": 0.1712653641588986, + "reward_after_mean": 0.1712653641588986, + "reward_after_std": 0.6374101359397173, + "reward_before_mean": 0.6755810640752316, + "reward_before_std": 0.5059886500239372, + "reward_change_max": 0.0, + "reward_change_mean": -0.5043157208710909, + "reward_change_min": -0.7193296477198601, + "reward_change_std": 0.2849651984870434, + "reward_std": 0.637410145252943, + "rewards/accuracy_reward": 0.4583333395421505, + "rewards/cosine_scaled_reward": 0.21724772220477462, + "step": 302 + }, + { + "clip_fraction": 0.0, + "completion_length": 2379.8125381469727, + "epoch": 0.3462857142857143, + "grad_norm": 0.028283070772886276, + "kl": 0.00022455304861068726, + "lambda_div_used": 0.5937864035367966, + "learning_rate": 4.656784084364238e-07, + "loss": -0.0542, + "reward": -0.11667206266429275, + "reward_after_mean": -0.11667206266429275, + "reward_after_std": 0.46380676329135895, + "reward_before_mean": 0.24170983396470547, + "reward_before_std": 0.4273997135460377, + "reward_change_max": 0.0, + "reward_change_mean": -0.3583819102495909, + "reward_change_min": -0.5776236318051815, + "reward_change_std": 0.22200697474181652, + "reward_std": 0.4638067800551653, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/cosine_scaled_reward": -0.008290180005133152, + "step": 303 + }, + { + "clip_fraction": 0.0, + "completion_length": 2483.270881652832, + "epoch": 0.3474285714285714, + "grad_norm": 0.02691042050719261, + "kl": 0.0002352595329284668, + "lambda_div_used": 0.6489763781428337, + "learning_rate": 4.6259454195101267e-07, + "loss": -0.0177, + "reward": -0.052505167201161385, + "reward_after_mean": -0.052505167201161385, + "reward_after_std": 0.7108322139829397, + "reward_before_mean": 0.2283354545943439, + "reward_before_std": 0.6949998550117016, + "reward_change_max": 0.0, + "reward_change_mean": -0.28084064833819866, + "reward_change_min": -0.5053656212985516, + "reward_change_std": 0.1956273503601551, + "reward_std": 0.7108322307467461, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/cosine_scaled_reward": -0.00083120446652174, + "step": 304 + }, + { + "clip_fraction": 0.0, + "completion_length": 2814.229232788086, + "epoch": 0.3485714285714286, + "grad_norm": 0.023387128487229347, + "kl": 0.0002447366714477539, + "lambda_div_used": 0.5986855253577232, + "learning_rate": 4.59514935484316e-07, + "loss": -0.0237, + "reward": -0.17212717607617378, + "reward_after_mean": -0.17212717607617378, + "reward_after_std": 0.5009447801858187, + "reward_before_mean": 0.13333414122462273, + "reward_before_std": 0.45383385568857193, + "reward_change_max": 0.0, + "reward_change_mean": -0.30546131171286106, + "reward_change_min": -0.460513886064291, + "reward_change_std": 0.1838802546262741, + "reward_std": 0.5009447950869799, + "rewards/accuracy_reward": 0.20833334140479565, + "rewards/cosine_scaled_reward": -0.07499920274131, + "step": 305 + }, + { + "clip_fraction": 0.0, + "completion_length": 2095.0000190734863, + "epoch": 0.3497142857142857, + "grad_norm": 0.03199386969208717, + "kl": 0.00018633902072906494, + "lambda_div_used": 0.595753937959671, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0245, + "reward": 0.03899537643883377, + "reward_after_mean": 0.03899537643883377, + "reward_after_std": 0.528584310784936, + "reward_before_mean": 0.49805059214122593, + "reward_before_std": 0.4409319751430303, + "reward_change_max": 0.0, + "reward_change_mean": -0.45905524492263794, + "reward_change_min": -0.6990172192454338, + "reward_change_std": 0.2754313191398978, + "reward_std": 0.528584323823452, + "rewards/accuracy_reward": 0.3541666753590107, + "rewards/cosine_scaled_reward": 0.14388393727131188, + "step": 306 + }, + { + "clip_fraction": 0.0, + "completion_length": 2299.4166984558105, + "epoch": 0.35085714285714287, + "grad_norm": 0.030135195702314377, + "kl": 0.00022998452186584473, + "lambda_div_used": 0.6275613307952881, + "learning_rate": 4.5336910277482155e-07, + "loss": 0.024, + "reward": -0.1547635430470109, + "reward_after_mean": -0.1547635430470109, + "reward_after_std": 0.6599444597959518, + "reward_before_mean": 0.11526766640599817, + "reward_before_std": 0.586240291595459, + "reward_change_max": 0.0, + "reward_change_mean": -0.270031226798892, + "reward_change_min": -0.40235158428549767, + "reward_change_std": 0.15426483657211065, + "reward_std": 0.6599444709718227, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/cosine_scaled_reward": -0.05139899626374245, + "step": 307 + }, + { + "clip_fraction": 0.0, + "completion_length": 3160.000030517578, + "epoch": 0.352, + "grad_norm": 0.018670011311769485, + "kl": 0.00022402405738830566, + "lambda_div_used": 0.6338188126683235, + "learning_rate": 4.503031760712397e-07, + "loss": -0.0036, + "reward": -0.02171214483678341, + "reward_after_mean": -0.02171214483678341, + "reward_after_std": 0.6237165722995996, + "reward_before_mean": 0.29556242609396577, + "reward_before_std": 0.6284392019733787, + "reward_change_max": 0.0, + "reward_change_mean": -0.3172745667397976, + "reward_change_min": -0.5880400538444519, + "reward_change_std": 0.227503115311265, + "reward_std": 0.6237165946513414, + "rewards/accuracy_reward": 0.2708333395421505, + "rewards/cosine_scaled_reward": 0.024729080265387893, + "step": 308 + }, + { + "clip_fraction": 0.0, + "completion_length": 2601.9583740234375, + "epoch": 0.35314285714285715, + "grad_norm": 0.02388334833085537, + "kl": 0.00020259618759155273, + "lambda_div_used": 0.645737886428833, + "learning_rate": 4.4724210845020494e-07, + "loss": 0.0479, + "reward": 0.09015273489058018, + "reward_after_mean": 0.09015273489058018, + "reward_after_std": 0.6656354945152998, + "reward_before_mean": 0.4416396114975214, + "reward_before_std": 0.6783724967390299, + "reward_change_max": 0.0, + "reward_change_mean": -0.3514868915081024, + "reward_change_min": -0.5935809202492237, + "reward_change_std": 0.24138008058071136, + "reward_std": 0.6656355243176222, + "rewards/accuracy_reward": 0.33333334140479565, + "rewards/cosine_scaled_reward": 0.10830628499388695, + "step": 309 + }, + { + "clip_fraction": 0.0, + "completion_length": 2018.3542022705078, + "epoch": 0.35428571428571426, + "grad_norm": 0.030827393755316734, + "kl": 0.00020694732666015625, + "lambda_div_used": 0.6246318891644478, + "learning_rate": 4.441860491038345e-07, + "loss": 0.0364, + "reward": -0.13591936416924, + "reward_after_mean": -0.13591936416924, + "reward_after_std": 0.590507235378027, + "reward_before_mean": 0.14517710404470563, + "reward_before_std": 0.5790550196543336, + "reward_change_max": 0.0, + "reward_change_mean": -0.28109647892415524, + "reward_change_min": -0.5089371241629124, + "reward_change_std": 0.1950351819396019, + "reward_std": 0.5905072540044785, + "rewards/accuracy_reward": 0.20833333767950535, + "rewards/cosine_scaled_reward": -0.06315623363479972, + "step": 310 + }, + { + "clip_fraction": 0.0, + "completion_length": 2212.083335876465, + "epoch": 0.3554285714285714, + "grad_norm": 0.025455351918935776, + "kl": 0.00022774934768676758, + "lambda_div_used": 0.5723904073238373, + "learning_rate": 4.4113514698014953e-07, + "loss": -0.0105, + "reward": 0.04124009236693382, + "reward_after_mean": 0.04124009236693382, + "reward_after_std": 0.47586701065301895, + "reward_before_mean": 0.5763039644807577, + "reward_before_std": 0.3296704487875104, + "reward_change_max": 0.0, + "reward_change_mean": -0.5350638851523399, + "reward_change_min": -0.7587380260229111, + "reward_change_std": 0.295849135145545, + "reward_std": 0.47586701437830925, + "rewards/accuracy_reward": 0.3958333395421505, + "rewards/cosine_scaled_reward": 0.18047063890844584, + "step": 311 + }, + { + "clip_fraction": 0.0, + "completion_length": 2184.833351135254, + "epoch": 0.3565714285714286, + "grad_norm": 0.030652416869997978, + "kl": 0.0002588629722595215, + "lambda_div_used": 0.5729342699050903, + "learning_rate": 4.3808955077581546e-07, + "loss": -0.0417, + "reward": 0.07517872378230095, + "reward_after_mean": 0.07517872378230095, + "reward_after_std": 0.4695996157824993, + "reward_before_mean": 0.596075527369976, + "reward_before_std": 0.333228693343699, + "reward_change_max": 0.0, + "reward_change_mean": -0.5208968166261911, + "reward_change_min": -0.7326644062995911, + "reward_change_std": 0.28812805004417896, + "reward_std": 0.46959962509572506, + "rewards/accuracy_reward": 0.4375000074505806, + "rewards/cosine_scaled_reward": 0.15857553109526634, + "step": 312 + }, + { + "clip_fraction": 0.0, + "completion_length": 2586.1875534057617, + "epoch": 0.3577142857142857, + "grad_norm": 0.022464681416749954, + "kl": 0.00023896992206573486, + "lambda_div_used": 0.5898676738142967, + "learning_rate": 4.350494089288943e-07, + "loss": 0.0132, + "reward": 0.004749574698507786, + "reward_after_mean": 0.004749574698507786, + "reward_after_std": 0.5146205350756645, + "reward_before_mean": 0.4418492801487446, + "reward_before_std": 0.41057482920587063, + "reward_change_max": 0.0, + "reward_change_mean": -0.4370996989309788, + "reward_change_min": -0.6041509285569191, + "reward_change_std": 0.24443841353058815, + "reward_std": 0.5146205425262451, + "rewards/accuracy_reward": 0.35416667722165585, + "rewards/cosine_scaled_reward": 0.08768259733915329, + "step": 313 + }, + { + "clip_fraction": 0.0, + "completion_length": 2277.68754196167, + "epoch": 0.3588571428571429, + "grad_norm": 0.029802966862916946, + "kl": 0.00015804357826709747, + "lambda_div_used": 0.57961256057024, + "learning_rate": 4.3201486961161093e-07, + "loss": -0.0058, + "reward": 0.04087065905332565, + "reward_after_mean": 0.04087065905332565, + "reward_after_std": 0.491618013009429, + "reward_before_mean": 0.5418765433132648, + "reward_before_std": 0.3644657013937831, + "reward_change_max": 0.0, + "reward_change_mean": -0.5010058786720037, + "reward_change_min": -0.7266513183712959, + "reward_change_std": 0.2839649748057127, + "reward_std": 0.4916180297732353, + "rewards/accuracy_reward": 0.3958333358168602, + "rewards/cosine_scaled_reward": 0.14604321867227554, + "step": 314 + }, + { + "clip_fraction": 0.0, + "completion_length": 2722.479179382324, + "epoch": 0.36, + "grad_norm": 0.02163972705602646, + "kl": 0.00025177001953125, + "lambda_div_used": 0.5554845109581947, + "learning_rate": 4.2898608072313045e-07, + "loss": 0.0302, + "reward": -0.1392173320055008, + "reward_after_mean": -0.1392173320055008, + "reward_after_std": 0.421754639595747, + "reward_before_mean": 0.3322529550641775, + "reward_before_std": 0.2525685231667012, + "reward_change_max": 0.0, + "reward_change_mean": -0.4714703354984522, + "reward_change_min": -0.6779767945408821, + "reward_change_std": 0.2537938868626952, + "reward_std": 0.4217546433210373, + "rewards/accuracy_reward": 0.2916666679084301, + "rewards/cosine_scaled_reward": 0.04058632627129555, + "step": 315 + }, + { + "clip_fraction": 0.0, + "completion_length": 3430.4166870117188, + "epoch": 0.36114285714285715, + "grad_norm": 0.021473117172718048, + "kl": 0.00034546852111816406, + "lambda_div_used": 0.5831886008381844, + "learning_rate": 4.2596318988235037e-07, + "loss": 0.0331, + "reward": -0.20306292921304703, + "reward_after_mean": -0.20306292921304703, + "reward_after_std": 0.4045538082718849, + "reward_before_mean": 0.12524887174367905, + "reward_before_std": 0.37981976941227913, + "reward_change_max": 0.0, + "reward_change_mean": -0.3283117860555649, + "reward_change_min": -0.5172865837812424, + "reward_change_std": 0.20252829603850842, + "reward_std": 0.40455381385982037, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/cosine_scaled_reward": -0.062251146882772446, + "step": 316 + }, + { + "clip_fraction": 0.0, + "completion_length": 2792.6250076293945, + "epoch": 0.36228571428571427, + "grad_norm": 0.023409582674503326, + "kl": 0.00030331313610076904, + "lambda_div_used": 0.5606855005025864, + "learning_rate": 4.2294634442070553e-07, + "loss": 0.0162, + "reward": -0.25401476211845875, + "reward_after_mean": -0.25401476211845875, + "reward_after_std": 0.39204780384898186, + "reward_before_mean": 0.13540820218622684, + "reward_before_std": 0.2758419858291745, + "reward_change_max": 0.0, + "reward_change_mean": -0.38942296989262104, + "reward_change_min": -0.5835930369794369, + "reward_change_std": 0.21769424341619015, + "reward_std": 0.39204781129956245, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.03125846944749355, + "step": 317 + }, + { + "clip_fraction": 0.0, + "completion_length": 1197.1875228881836, + "epoch": 0.36342857142857143, + "grad_norm": 0.044679004698991776, + "kl": 0.00012731552124023438, + "lambda_div_used": 0.6002074480056763, + "learning_rate": 4.1993569137498776e-07, + "loss": -0.0198, + "reward": 0.03343228530138731, + "reward_after_mean": 0.03343228530138731, + "reward_after_std": 0.5209389794617891, + "reward_before_mean": 0.4655236080288887, + "reward_before_std": 0.45914868731051683, + "reward_change_max": 0.0, + "reward_change_mean": -0.4320913068950176, + "reward_change_min": -0.6742168106138706, + "reward_change_std": 0.2611297369003296, + "reward_std": 0.5209389794617891, + "rewards/accuracy_reward": 0.3541666753590107, + "rewards/cosine_scaled_reward": 0.1113569182343781, + "step": 318 + }, + { + "clip_fraction": 0.0, + "completion_length": 2743.4167098999023, + "epoch": 0.36457142857142855, + "grad_norm": 0.03044748492538929, + "kl": 0.0002751946449279785, + "lambda_div_used": 0.5535945892333984, + "learning_rate": 4.1693137748017915e-07, + "loss": 0.0111, + "reward": -0.4908355651423335, + "reward_after_mean": -0.4908355651423335, + "reward_after_std": 0.3172128964215517, + "reward_before_mean": -0.24739529378712177, + "reward_before_std": 0.23890432622283697, + "reward_change_max": 0.0, + "reward_change_mean": -0.2434402648359537, + "reward_change_min": -0.35953374207019806, + "reward_change_std": 0.13119421433657408, + "reward_std": 0.31721290200948715, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.2682286258786917, + "step": 319 + }, + { + "clip_fraction": 0.0, + "completion_length": 1857.3333587646484, + "epoch": 0.3657142857142857, + "grad_norm": 0.03547307848930359, + "kl": 0.00026220083236694336, + "lambda_div_used": 0.6022170931100845, + "learning_rate": 4.1393354916230005e-07, + "loss": 0.0462, + "reward": -0.125480268150568, + "reward_after_mean": -0.125480268150568, + "reward_after_std": 0.505291972309351, + "reward_before_mean": 0.19450905406847596, + "reward_before_std": 0.4768552405294031, + "reward_change_max": 0.0, + "reward_change_mean": -0.31998929381370544, + "reward_change_min": -0.5316651687026024, + "reward_change_std": 0.20880178455263376, + "reward_std": 0.505291985347867, + "rewards/accuracy_reward": 0.2291666753590107, + "rewards/cosine_scaled_reward": -0.034657632233574986, + "step": 320 + }, + { + "clip_fraction": 0.0, + "completion_length": 1452.770881652832, + "epoch": 0.3668571428571429, + "grad_norm": 0.03124224953353405, + "kl": 0.00018140673637390137, + "lambda_div_used": 0.6199431717395782, + "learning_rate": 4.1094235253127374e-07, + "loss": -0.0599, + "reward": 0.282286923378706, + "reward_after_mean": 0.282286923378706, + "reward_after_std": 0.6488686576485634, + "reward_before_mean": 0.8242554701864719, + "reward_before_std": 0.5566414860077202, + "reward_change_max": 0.0, + "reward_change_mean": -0.5419685430824757, + "reward_change_min": -0.8236861452460289, + "reward_change_std": 0.33083922043442726, + "reward_std": 0.648868665099144, + "rewards/accuracy_reward": 0.5625000074505806, + "rewards/cosine_scaled_reward": 0.26175545156002045, + "step": 321 + }, + { + "clip_fraction": 0.0, + "completion_length": 2662.50004196167, + "epoch": 0.368, + "grad_norm": 0.036272305995225906, + "kl": 0.0003364086151123047, + "lambda_div_used": 0.6552734896540642, + "learning_rate": 4.079579333738039e-07, + "loss": 0.0123, + "reward": -0.039356768131256104, + "reward_after_mean": -0.039356768131256104, + "reward_after_std": 0.7184183727949858, + "reward_before_mean": 0.23175650835037231, + "reward_before_std": 0.7305669207125902, + "reward_change_max": 0.0, + "reward_change_mean": -0.2711132802069187, + "reward_change_min": -0.5112780928611755, + "reward_change_std": 0.20828023366630077, + "reward_std": 0.7184183821082115, + "rewards/accuracy_reward": 0.27083333395421505, + "rewards/cosine_scaled_reward": -0.03907683305442333, + "step": 322 + }, + { + "clip_fraction": 0.0, + "completion_length": 2875.937530517578, + "epoch": 0.36914285714285716, + "grad_norm": 0.023642728105187416, + "kl": 0.0003116130828857422, + "lambda_div_used": 0.5762533918023109, + "learning_rate": 4.0498043714627006e-07, + "loss": -0.0062, + "reward": -0.18317513819783926, + "reward_after_mean": -0.18317513819783926, + "reward_after_std": 0.45206453651189804, + "reward_before_mean": 0.1913843434303999, + "reward_before_std": 0.34759796876460314, + "reward_change_max": 0.0, + "reward_change_mean": -0.3745594993233681, + "reward_change_min": -0.5339466538280249, + "reward_change_std": 0.2068865867331624, + "reward_std": 0.4520645458251238, + "rewards/accuracy_reward": 0.22916666977107525, + "rewards/cosine_scaled_reward": -0.037782331462949514, + "step": 323 + }, + { + "clip_fraction": 0.0, + "completion_length": 2517.541732788086, + "epoch": 0.3702857142857143, + "grad_norm": 0.02370315045118332, + "kl": 0.0002675652503967285, + "lambda_div_used": 0.582614079117775, + "learning_rate": 4.020100089676376e-07, + "loss": 0.0264, + "reward": -0.09797720052301884, + "reward_after_mean": -0.09797720052301884, + "reward_after_std": 0.46359810046851635, + "reward_before_mean": 0.30637714080512524, + "reward_before_std": 0.37786578573286533, + "reward_change_max": 0.0, + "reward_change_mean": -0.4043543320149183, + "reward_change_min": -0.6221220158040524, + "reward_change_std": 0.2347763581201434, + "reward_std": 0.46359810046851635, + "rewards/accuracy_reward": 0.2708333395421505, + "rewards/cosine_scaled_reward": 0.03554379381239414, + "step": 324 + }, + { + "clip_fraction": 0.0, + "completion_length": 2340.104232788086, + "epoch": 0.37142857142857144, + "grad_norm": 0.02759523130953312, + "kl": 0.0002740621566772461, + "lambda_div_used": 0.6287032291293144, + "learning_rate": 3.9904679361238526e-07, + "loss": 0.0386, + "reward": 0.3422952927649021, + "reward_after_mean": 0.3422952927649021, + "reward_after_std": 0.6593811456114054, + "reward_before_mean": 0.8776058983057737, + "reward_before_std": 0.5943253133445978, + "reward_change_max": 0.0, + "reward_change_mean": -0.5353106111288071, + "reward_change_min": -0.8339854516088963, + "reward_change_std": 0.32581204548478127, + "reward_std": 0.6593811772763729, + "rewards/accuracy_reward": 0.5416666753590107, + "rewards/cosine_scaled_reward": 0.335939209908247, + "step": 325 + }, + { + "clip_fraction": 0.0, + "completion_length": 2069.0208740234375, + "epoch": 0.37257142857142855, + "grad_norm": 0.026782048866152763, + "kl": 0.00021858513355255127, + "lambda_div_used": 0.5514643862843513, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0214, + "reward": -0.2377523072063923, + "reward_after_mean": -0.2377523072063923, + "reward_after_std": 0.3420621510595083, + "reward_before_mean": 0.17398178996518254, + "reward_before_std": 0.2309601987944916, + "reward_change_max": 0.0, + "reward_change_mean": -0.41173411533236504, + "reward_change_min": -0.586280532181263, + "reward_change_std": 0.22657191008329391, + "reward_std": 0.3420621529221535, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/cosine_scaled_reward": -0.034351545851677656, + "step": 326 + }, + { + "clip_fraction": 0.0, + "completion_length": 2595.583339691162, + "epoch": 0.3737142857142857, + "grad_norm": 0.023073619231581688, + "kl": 0.0003046169877052307, + "lambda_div_used": 0.5742950737476349, + "learning_rate": 3.931425787051832e-07, + "loss": 0.016, + "reward": -0.06984845921397209, + "reward_after_mean": -0.06984845921397209, + "reward_after_std": 0.4958471246063709, + "reward_before_mean": 0.39507998805493116, + "reward_before_std": 0.33821228239685297, + "reward_change_max": 0.0, + "reward_change_mean": -0.46492844074964523, + "reward_change_min": -0.6655924804508686, + "reward_change_std": 0.25432714726775885, + "reward_std": 0.49584713764488697, + "rewards/accuracy_reward": 0.33333333395421505, + "rewards/cosine_scaled_reward": 0.06174664665013552, + "step": 327 + }, + { + "clip_fraction": 0.0, + "completion_length": 3254.375030517578, + "epoch": 0.37485714285714283, + "grad_norm": 0.02187371626496315, + "kl": 0.0003033876419067383, + "lambda_div_used": 0.6039082854986191, + "learning_rate": 3.902018669163384e-07, + "loss": 0.0465, + "reward": -0.22737010568380356, + "reward_after_mean": -0.22737010568380356, + "reward_after_std": 0.5139794517308474, + "reward_before_mean": 0.057241520611569285, + "reward_before_std": 0.47974141500890255, + "reward_change_max": 0.0, + "reward_change_mean": -0.28461163491010666, + "reward_change_min": -0.4783761650323868, + "reward_change_std": 0.18032598588615656, + "reward_std": 0.5139794517308474, + "rewards/accuracy_reward": 0.16666667349636555, + "rewards/cosine_scaled_reward": -0.10942514054477215, + "step": 328 + }, + { + "clip_fraction": 0.0, + "completion_length": 1712.7500305175781, + "epoch": 0.376, + "grad_norm": 0.030909525230526924, + "kl": 0.00020560622215270996, + "lambda_div_used": 0.5851015225052834, + "learning_rate": 3.872689434630585e-07, + "loss": 0.0147, + "reward": -0.0780089907348156, + "reward_after_mean": -0.0780089907348156, + "reward_after_std": 0.4830572586506605, + "reward_before_mean": 0.3372959513217211, + "reward_before_std": 0.39714881777763367, + "reward_change_max": 0.0, + "reward_change_mean": -0.415304945781827, + "reward_change_min": -0.6223411783576012, + "reward_change_std": 0.24614232685416937, + "reward_std": 0.48305728659033775, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/cosine_scaled_reward": 0.06646260246634483, + "step": 329 + }, + { + "clip_fraction": 0.0, + "completion_length": 2283.437515258789, + "epoch": 0.37714285714285717, + "grad_norm": 0.03479500487446785, + "kl": 0.00036913156509399414, + "lambda_div_used": 0.629969134926796, + "learning_rate": 3.843439512918949e-07, + "loss": -0.0771, + "reward": -0.08872065320611, + "reward_after_mean": -0.08872065320611, + "reward_after_std": 0.6100571732968092, + "reward_before_mean": 0.20380639098584652, + "reward_before_std": 0.606320459395647, + "reward_change_max": 0.0, + "reward_change_mean": -0.2925270590931177, + "reward_change_min": -0.5179039165377617, + "reward_change_std": 0.20588424988090992, + "reward_std": 0.6100572124123573, + "rewards/accuracy_reward": 0.22916666977107525, + "rewards/cosine_scaled_reward": -0.02536027878522873, + "step": 330 + }, + { + "clip_fraction": 0.0, + "completion_length": 2365.354200363159, + "epoch": 0.3782857142857143, + "grad_norm": 0.05943391099572182, + "kl": 0.0003091096878051758, + "lambda_div_used": 0.6346529722213745, + "learning_rate": 3.8142703296283953e-07, + "loss": -0.0337, + "reward": -0.24408827535808086, + "reward_after_mean": -0.24408827535808086, + "reward_after_std": 0.6825795099139214, + "reward_before_mean": -0.026182920671999454, + "reward_before_std": 0.6199203189462423, + "reward_change_max": 0.0, + "reward_change_mean": -0.21790535561740398, + "reward_change_min": -0.3899071477353573, + "reward_change_std": 0.13575981836766005, + "reward_std": 0.6825795285403728, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/cosine_scaled_reward": -0.1511829246301204, + "step": 331 + }, + { + "clip_fraction": 0.0, + "completion_length": 2488.937530517578, + "epoch": 0.37942857142857145, + "grad_norm": 0.02497093193233013, + "kl": 0.00024446845054626465, + "lambda_div_used": 0.628095343708992, + "learning_rate": 3.785183306423767e-07, + "loss": 0.002, + "reward": -0.0888600671896711, + "reward_after_mean": -0.0888600671896711, + "reward_after_std": 0.634421993046999, + "reward_before_mean": 0.22087145410478115, + "reward_before_std": 0.5863482365384698, + "reward_change_max": 0.0, + "reward_change_mean": -0.30973155051469803, + "reward_change_min": -0.5146533660590649, + "reward_change_std": 0.19153987523168325, + "reward_std": 0.6344220079481602, + "rewards/accuracy_reward": 0.2291666753590107, + "rewards/cosine_scaled_reward": -0.00829521007835865, + "step": 332 + }, + { + "clip_fraction": 0.0, + "completion_length": 1976.333381652832, + "epoch": 0.38057142857142856, + "grad_norm": 0.029570044949650764, + "kl": 0.00023859739303588867, + "lambda_div_used": 0.5713351741433144, + "learning_rate": 3.7561798609655373e-07, + "loss": 0.026, + "reward": 0.11462849378585815, + "reward_after_mean": 0.11462849378585815, + "reward_after_std": 0.5290882792323828, + "reward_before_mean": 0.6867042146623135, + "reward_before_std": 0.3248422802425921, + "reward_change_max": 0.0, + "reward_change_mean": -0.5720757059752941, + "reward_change_min": -0.7701217532157898, + "reward_change_std": 0.30192676838487387, + "reward_std": 0.5290882866829634, + "rewards/accuracy_reward": 0.47916666977107525, + "rewards/cosine_scaled_reward": 0.20753752067685127, + "step": 333 + }, + { + "clip_fraction": 0.0, + "completion_length": 2970.2708892822266, + "epoch": 0.38171428571428573, + "grad_norm": 0.022143961861729622, + "kl": 0.0002802610397338867, + "lambda_div_used": 0.5814503356814384, + "learning_rate": 3.72726140684072e-07, + "loss": -0.0083, + "reward": -0.36110448837280273, + "reward_after_mean": -0.36110448837280273, + "reward_after_std": 0.42284002527594566, + "reward_before_mean": -0.09825330413877964, + "reward_before_std": 0.3718461263924837, + "reward_change_max": 0.0, + "reward_change_mean": -0.26285118237137794, + "reward_change_min": -0.4302907735109329, + "reward_change_std": 0.16061531472951174, + "reward_std": 0.42284005135297775, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/cosine_scaled_reward": -0.1815866343677044, + "step": 334 + }, + { + "clip_fraction": 0.0, + "completion_length": 2137.7916717529297, + "epoch": 0.38285714285714284, + "grad_norm": 0.023810893297195435, + "kl": 0.00019755959510803223, + "lambda_div_used": 0.5730342343449593, + "learning_rate": 3.6984293534939737e-07, + "loss": -0.0589, + "reward": -0.003980423090979457, + "reward_after_mean": -0.003980423090979457, + "reward_after_std": 0.4973279498517513, + "reward_before_mean": 0.48682230338454247, + "reward_before_std": 0.3303193561732769, + "reward_change_max": 0.0, + "reward_change_mean": -0.4908027183264494, + "reward_change_min": -0.6631990969181061, + "reward_change_std": 0.25930201914161444, + "reward_std": 0.4973279610276222, + "rewards/accuracy_reward": 0.35416666977107525, + "rewards/cosine_scaled_reward": 0.132655612193048, + "step": 335 + }, + { + "clip_fraction": 0.0, + "completion_length": 2604.5625762939453, + "epoch": 0.384, + "grad_norm": 0.019800275564193726, + "kl": 0.00027495622634887695, + "lambda_div_used": 0.6422952190041542, + "learning_rate": 3.6696851061588994e-07, + "loss": -0.0153, + "reward": 0.1156077766790986, + "reward_after_mean": 0.1156077766790986, + "reward_after_std": 0.7227907460182905, + "reward_before_mean": 0.5011630854569376, + "reward_before_std": 0.6614610198885202, + "reward_change_max": 0.0, + "reward_change_mean": -0.3855553139001131, + "reward_change_min": -0.6121690329164267, + "reward_change_std": 0.23865524679422379, + "reward_std": 0.7227907720953226, + "rewards/accuracy_reward": 0.3750000074505806, + "rewards/cosine_scaled_reward": 0.1261630654335022, + "step": 336 + }, + { + "clip_fraction": 0.0, + "completion_length": 2581.6667404174805, + "epoch": 0.3851428571428571, + "grad_norm": 0.024305738508701324, + "kl": 0.000293731689453125, + "lambda_div_used": 0.5970565602183342, + "learning_rate": 3.641030065789562e-07, + "loss": 0.0618, + "reward": -0.012716710567474365, + "reward_after_mean": -0.012716710567474365, + "reward_after_std": 0.5347360204905272, + "reward_before_mean": 0.4010282773524523, + "reward_before_std": 0.4501037606969476, + "reward_change_max": 0.0, + "reward_change_mean": -0.41374498419463634, + "reward_change_min": -0.597484715282917, + "reward_change_std": 0.24177721049636602, + "reward_std": 0.5347360335290432, + "rewards/accuracy_reward": 0.31250000558793545, + "rewards/cosine_scaled_reward": 0.08852825942449272, + "step": 337 + }, + { + "clip_fraction": 0.0, + "completion_length": 1854.9375381469727, + "epoch": 0.3862857142857143, + "grad_norm": 0.035376228392124176, + "kl": 0.00024819374084472656, + "lambda_div_used": 0.6688028946518898, + "learning_rate": 3.612465628992203e-07, + "loss": 0.1016, + "reward": 0.3372328467667103, + "reward_after_mean": 0.3372328467667103, + "reward_after_std": 0.8120726235210896, + "reward_before_mean": 0.783452745527029, + "reward_before_std": 0.787550600245595, + "reward_change_max": 0.0, + "reward_change_mean": -0.44621986895799637, + "reward_change_min": -0.745157428085804, + "reward_change_std": 0.29823943972587585, + "reward_std": 0.812072642147541, + "rewards/accuracy_reward": 0.500000013038516, + "rewards/cosine_scaled_reward": 0.2834526968654245, + "step": 338 + }, + { + "clip_fraction": 0.0, + "completion_length": 2807.541717529297, + "epoch": 0.38742857142857146, + "grad_norm": 0.024581631645560265, + "kl": 0.0003020763397216797, + "lambda_div_used": 0.559957392513752, + "learning_rate": 3.5839931879571725e-07, + "loss": -0.0589, + "reward": -0.2989091109484434, + "reward_after_mean": -0.2989091109484434, + "reward_after_std": 0.3937798347324133, + "reward_before_mean": 0.06776450201869011, + "reward_before_std": 0.269554709084332, + "reward_change_max": 0.0, + "reward_change_mean": -0.3666736055165529, + "reward_change_min": -0.5128730908036232, + "reward_change_std": 0.19430112652480602, + "reward_std": 0.3937798459082842, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/cosine_scaled_reward": -0.07806883845478296, + "step": 339 + }, + { + "clip_fraction": 0.0, + "completion_length": 2440.437530517578, + "epoch": 0.38857142857142857, + "grad_norm": 0.027588481083512306, + "kl": 0.0002573728561401367, + "lambda_div_used": 0.6044978573918343, + "learning_rate": 3.555614130391079e-07, + "loss": -0.021, + "reward": -0.09739532321691513, + "reward_after_mean": -0.09739532321691513, + "reward_after_std": 0.5066191554069519, + "reward_before_mean": 0.23490323033183813, + "reward_before_std": 0.48513105837628245, + "reward_change_max": 0.0, + "reward_change_mean": -0.33229855448007584, + "reward_change_min": -0.5539730787277222, + "reward_change_std": 0.21668985951691866, + "reward_std": 0.5066191554069519, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/cosine_scaled_reward": 0.005736543796956539, + "step": 340 + }, + { + "clip_fraction": 0.0, + "completion_length": 2306.4583740234375, + "epoch": 0.38971428571428574, + "grad_norm": 0.025286095216870308, + "kl": 0.00023421645164489746, + "lambda_div_used": 0.5724563226103783, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0526, + "reward": 0.04836506303399801, + "reward_after_mean": 0.04836506303399801, + "reward_after_std": 0.4924194272607565, + "reward_before_mean": 0.5868019293993711, + "reward_before_std": 0.33085333183407784, + "reward_change_max": 0.0, + "reward_change_mean": -0.5384368915110826, + "reward_change_min": -0.7688934281468391, + "reward_change_std": 0.29607443511486053, + "reward_std": 0.492419445887208, + "rewards/accuracy_reward": 0.3958333395421505, + "rewards/cosine_scaled_reward": 0.19096860231366009, + "step": 341 + }, + { + "clip_fraction": 0.0, + "completion_length": 2629.291690826416, + "epoch": 0.39085714285714285, + "grad_norm": 0.02702566236257553, + "kl": 0.00029778480529785156, + "lambda_div_used": 0.6460641473531723, + "learning_rate": 3.4991416936678276e-07, + "loss": -0.0077, + "reward": 0.11682657990604639, + "reward_after_mean": 0.11682657990604639, + "reward_after_std": 0.6720432955771685, + "reward_before_mean": 0.4811732564121485, + "reward_before_std": 0.6802498865872622, + "reward_change_max": 0.0, + "reward_change_mean": -0.3643466793000698, + "reward_change_min": -0.6579400822520256, + "reward_change_std": 0.255507318302989, + "reward_std": 0.6720433253794909, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/cosine_scaled_reward": 0.1061732517555356, + "step": 342 + }, + { + "clip_fraction": 0.0, + "completion_length": 2944.5833435058594, + "epoch": 0.392, + "grad_norm": 0.01883111707866192, + "kl": 0.0002968311309814453, + "lambda_div_used": 0.6280755251646042, + "learning_rate": 3.471051066897562e-07, + "loss": 0.0174, + "reward": 0.13441785983741283, + "reward_after_mean": 0.13441785983741283, + "reward_after_std": 0.62815947458148, + "reward_before_mean": 0.5553851053118706, + "reward_before_std": 0.5964916851371527, + "reward_change_max": 0.0, + "reward_change_mean": -0.42096727155148983, + "reward_change_min": -0.6972850449383259, + "reward_change_std": 0.27345132920891047, + "reward_std": 0.6281594894826412, + "rewards/accuracy_reward": 0.4166666753590107, + "rewards/cosine_scaled_reward": 0.13871843740344048, + "step": 343 + }, + { + "clip_fraction": 0.0, + "completion_length": 2095.5833854675293, + "epoch": 0.3931428571428571, + "grad_norm": 0.023183098062872887, + "kl": 0.00022208690643310547, + "lambda_div_used": 0.5810001268982887, + "learning_rate": 3.4430593282358777e-07, + "loss": -0.024, + "reward": 0.11522329319268465, + "reward_after_mean": 0.11522329319268465, + "reward_after_std": 0.561205493286252, + "reward_before_mean": 0.6688053011894226, + "reward_before_std": 0.3688823012635112, + "reward_change_max": 0.0, + "reward_change_mean": -0.5535820256918669, + "reward_change_min": -0.7913318648934364, + "reward_change_std": 0.2952663041651249, + "reward_std": 0.5612055025994778, + "rewards/accuracy_reward": 0.47916666977107525, + "rewards/cosine_scaled_reward": 0.18963862350210547, + "step": 344 + }, + { + "clip_fraction": 0.0, + "completion_length": 2982.229202270508, + "epoch": 0.3942857142857143, + "grad_norm": 0.022071614861488342, + "kl": 0.00032585859298706055, + "lambda_div_used": 0.577229768037796, + "learning_rate": 3.4151678419606233e-07, + "loss": -0.061, + "reward": -0.20981058850884438, + "reward_after_mean": -0.20981058850884438, + "reward_after_std": 0.40093352645635605, + "reward_before_mean": 0.13120032008737326, + "reward_before_std": 0.353534915484488, + "reward_change_max": 0.0, + "reward_change_mean": -0.3410108871757984, + "reward_change_min": -0.5285827927291393, + "reward_change_std": 0.20349892415106297, + "reward_std": 0.40093354508280754, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/cosine_scaled_reward": -0.03546636272221804, + "step": 345 + }, + { + "clip_fraction": 0.0, + "completion_length": 3075.916702270508, + "epoch": 0.3954285714285714, + "grad_norm": 0.018548911437392235, + "kl": 0.0003154873847961426, + "lambda_div_used": 0.5923640578985214, + "learning_rate": 3.387377967463493e-07, + "loss": 0.0031, + "reward": -0.25945382937788963, + "reward_after_mean": -0.25945382937788963, + "reward_after_std": 0.47871536388993263, + "reward_before_mean": 0.022648759186267853, + "reward_before_std": 0.41908061131834984, + "reward_change_max": 0.0, + "reward_change_mean": -0.28210258670151234, + "reward_change_min": -0.4310699477791786, + "reward_change_std": 0.16193275339901447, + "reward_std": 0.4787153732031584, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/cosine_scaled_reward": -0.12318458454683423, + "step": 346 + }, + { + "clip_fraction": 0.0, + "completion_length": 3052.8333740234375, + "epoch": 0.3965714285714286, + "grad_norm": 0.01688998006284237, + "kl": 0.0003132820129394531, + "lambda_div_used": 0.5991570502519608, + "learning_rate": 3.359691059183761e-07, + "loss": 0.0425, + "reward": -0.21558007411658764, + "reward_after_mean": -0.21558007411658764, + "reward_after_std": 0.504334045574069, + "reward_before_mean": 0.08187778666615486, + "reward_before_std": 0.4535634834319353, + "reward_change_max": 0.0, + "reward_change_mean": -0.2974578682333231, + "reward_change_min": -0.48911403492093086, + "reward_change_std": 0.1813967889174819, + "reward_std": 0.5043340623378754, + "rewards/accuracy_reward": 0.16666667349636555, + "rewards/cosine_scaled_reward": -0.08478887472301722, + "step": 347 + }, + { + "clip_fraction": 0.0, + "completion_length": 2511.916679382324, + "epoch": 0.3977142857142857, + "grad_norm": 0.031576935201883316, + "kl": 0.00030159950256347656, + "lambda_div_used": 0.6005090326070786, + "learning_rate": 3.3321084665422803e-07, + "loss": 0.048, + "reward": -0.08813801780343056, + "reward_after_mean": -0.08813801780343056, + "reward_after_std": 0.4803820662200451, + "reward_before_mean": 0.25806165859103203, + "reward_before_std": 0.4620923697948456, + "reward_change_max": 0.0, + "reward_change_mean": -0.3461996652185917, + "reward_change_min": -0.547101479023695, + "reward_change_std": 0.22022681962698698, + "reward_std": 0.48038206808269024, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/cosine_scaled_reward": 0.008061652071774006, + "step": 348 + }, + { + "clip_fraction": 0.0, + "completion_length": 3160.916717529297, + "epoch": 0.39885714285714285, + "grad_norm": 0.020290188491344452, + "kl": 0.00036334991455078125, + "lambda_div_used": 0.5632076561450958, + "learning_rate": 3.3046315338757026e-07, + "loss": 0.0316, + "reward": -0.26980413869023323, + "reward_after_mean": -0.26980413869023323, + "reward_after_std": 0.3867268729954958, + "reward_before_mean": 0.09292120113968849, + "reward_before_std": 0.2870235051959753, + "reward_change_max": 0.0, + "reward_change_mean": -0.3627253398299217, + "reward_change_min": -0.5702350400388241, + "reward_change_std": 0.20531136635690928, + "reward_std": 0.386726887896657, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.0737454742193222, + "step": 349 + }, + { + "clip_fraction": 0.0, + "completion_length": 2213.604202270508, + "epoch": 0.4, + "grad_norm": 0.030296506360173225, + "kl": 0.0002709701657295227, + "lambda_div_used": 0.6561494767665863, + "learning_rate": 3.2772616003709616e-07, + "loss": 0.0154, + "reward": 0.1817268170416355, + "reward_after_mean": 0.1817268170416355, + "reward_after_std": 0.6988476235419512, + "reward_before_mean": 0.5450442042201757, + "reward_before_std": 0.7353325374424458, + "reward_change_max": 0.0, + "reward_change_mean": -0.3633174039423466, + "reward_change_min": -0.6499125882983208, + "reward_change_std": 0.26448090467602015, + "reward_std": 0.6988476365804672, + "rewards/accuracy_reward": 0.3750000074505806, + "rewards/cosine_scaled_reward": 0.17004419304430485, + "step": 350 + }, + { + "clip_fraction": 0.0, + "completion_length": 2509.5208740234375, + "epoch": 0.40114285714285713, + "grad_norm": 0.02334379218518734, + "kl": 0.0002808868885040283, + "lambda_div_used": 0.602841705083847, + "learning_rate": 3.250000000000001e-07, + "loss": 0.0785, + "reward": -0.02256767451763153, + "reward_after_mean": -0.02256767451763153, + "reward_after_std": 0.545713946223259, + "reward_before_mean": 0.3783569000661373, + "reward_before_std": 0.47443881165236235, + "reward_change_max": 0.0, + "reward_change_mean": -0.40092457458376884, + "reward_change_min": -0.5974738858640194, + "reward_change_std": 0.24015377275645733, + "reward_std": 0.5457139611244202, + "rewards/accuracy_reward": 0.31250000558793545, + "rewards/cosine_scaled_reward": 0.06585689261555672, + "step": 351 + }, + { + "clip_fraction": 0.0, + "completion_length": 2237.7917251586914, + "epoch": 0.4022857142857143, + "grad_norm": 0.02218765579164028, + "kl": 0.00025266408920288086, + "lambda_div_used": 0.6070134416222572, + "learning_rate": 3.222848061454764e-07, + "loss": 0.0034, + "reward": -0.07995379093335941, + "reward_after_mean": -0.07995379093335941, + "reward_after_std": 0.5729443337768316, + "reward_before_mean": 0.291741443797946, + "reward_before_std": 0.4979767380282283, + "reward_change_max": 0.0, + "reward_change_mean": -0.3716952446848154, + "reward_change_min": -0.6056977100670338, + "reward_change_std": 0.23109738621860743, + "reward_std": 0.5729443468153477, + "rewards/accuracy_reward": 0.2916666679084301, + "rewards/cosine_scaled_reward": 7.475726306438446e-05, + "step": 352 + }, + { + "clip_fraction": 0.0, + "completion_length": 2214.4583702087402, + "epoch": 0.4034285714285714, + "grad_norm": 0.02660507895052433, + "kl": 0.0002359449863433838, + "lambda_div_used": 0.670557290315628, + "learning_rate": 3.195807108082429e-07, + "loss": -0.0096, + "reward": 0.11138913966715336, + "reward_after_mean": 0.11138913966715336, + "reward_after_std": 0.79796995036304, + "reward_before_mean": 0.420612467918545, + "reward_before_std": 0.7911950433626771, + "reward_change_max": 0.0, + "reward_change_mean": -0.30922332406044006, + "reward_change_min": -0.5478326119482517, + "reward_change_std": 0.21525408141314983, + "reward_std": 0.7979699578136206, + "rewards/accuracy_reward": 0.35416667722165585, + "rewards/cosine_scaled_reward": 0.06644579023122787, + "step": 353 + }, + { + "clip_fraction": 0.0, + "completion_length": 2018.0625228881836, + "epoch": 0.4045714285714286, + "grad_norm": 0.028822243213653564, + "kl": 0.0002269148826599121, + "lambda_div_used": 0.6141867712140083, + "learning_rate": 3.168878457820915e-07, + "loss": 0.0254, + "reward": 0.07253095135092735, + "reward_after_mean": 0.07253095135092735, + "reward_after_std": 0.6088422238826752, + "reward_before_mean": 0.4882249776273966, + "reward_before_std": 0.5237803608179092, + "reward_change_max": 0.0, + "reward_change_mean": -0.4156940244138241, + "reward_change_min": -0.5956555530428886, + "reward_change_std": 0.23860780615359545, + "reward_std": 0.6088422238826752, + "rewards/accuracy_reward": 0.3541666753590107, + "rewards/cosine_scaled_reward": 0.13405831216368824, + "step": 354 + }, + { + "clip_fraction": 0.0, + "completion_length": 2018.0208740234375, + "epoch": 0.4057142857142857, + "grad_norm": 0.029382316395640373, + "kl": 0.00026294589042663574, + "lambda_div_used": 0.5812733992934227, + "learning_rate": 3.142063423134644e-07, + "loss": 0.0145, + "reward": 0.23357930406928062, + "reward_after_mean": 0.23357930406928062, + "reward_after_std": 0.5394825823605061, + "reward_before_mean": 0.8446227628737688, + "reward_before_std": 0.37303208094090223, + "reward_change_max": 0.0, + "reward_change_mean": -0.6110434681177139, + "reward_change_min": -0.8454090058803558, + "reward_change_std": 0.3367054909467697, + "reward_std": 0.5394826009869576, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/cosine_scaled_reward": 0.3029560726135969, + "step": 355 + }, + { + "clip_fraction": 0.0, + "completion_length": 2752.3333740234375, + "epoch": 0.40685714285714286, + "grad_norm": 0.01905817724764347, + "kl": 0.00031578540802001953, + "lambda_div_used": 0.6255660429596901, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0216, + "reward": 0.12779017974389717, + "reward_after_mean": 0.12779017974389717, + "reward_after_std": 0.6442528441548347, + "reward_before_mean": 0.5468167327344418, + "reward_before_std": 0.5852497918531299, + "reward_change_max": 0.0, + "reward_change_mean": -0.41902654618024826, + "reward_change_min": -0.6820830404758453, + "reward_change_std": 0.2657659938558936, + "reward_std": 0.6442528460174799, + "rewards/accuracy_reward": 0.3958333395421505, + "rewards/cosine_scaled_reward": 0.15098336525261402, + "step": 356 + }, + { + "clip_fraction": 0.0, + "completion_length": 2952.520854949951, + "epoch": 0.408, + "grad_norm": 0.024500912055373192, + "kl": 0.0002726316452026367, + "lambda_div_used": 0.573906421661377, + "learning_rate": 3.0887794225945143e-07, + "loss": -0.0555, + "reward": -0.22403784468770027, + "reward_after_mean": -0.22403784468770027, + "reward_after_std": 0.4447880759835243, + "reward_before_mean": 0.14636994618922472, + "reward_before_std": 0.3346911370754242, + "reward_change_max": 0.0, + "reward_change_mean": -0.3704077899456024, + "reward_change_min": -0.5609126053750515, + "reward_change_std": 0.20881207659840584, + "reward_std": 0.4447880797088146, + "rewards/accuracy_reward": 0.18750000186264515, + "rewards/cosine_scaled_reward": -0.04113006801344454, + "step": 357 + }, + { + "clip_fraction": 0.0, + "completion_length": 2411.8541946411133, + "epoch": 0.40914285714285714, + "grad_norm": 0.024526813998818398, + "kl": 0.00026684999465942383, + "lambda_div_used": 0.6140530630946159, + "learning_rate": 3.062313053727671e-07, + "loss": -0.0357, + "reward": 0.30990589410066605, + "reward_after_mean": 0.30990589410066605, + "reward_after_std": 0.5656850170344114, + "reward_before_mean": 0.8357270993292332, + "reward_before_std": 0.5201095007359982, + "reward_change_max": 0.0, + "reward_change_mean": -0.5258211866021156, + "reward_change_min": -0.7649649046361446, + "reward_change_std": 0.3101219357922673, + "reward_std": 0.5656850375235081, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/cosine_scaled_reward": 0.2940603978931904, + "step": 358 + }, + { + "clip_fraction": 0.0, + "completion_length": 2100.604217529297, + "epoch": 0.4102857142857143, + "grad_norm": 0.029383460059762, + "kl": 0.00023573637008666992, + "lambda_div_used": 0.6149207651615143, + "learning_rate": 3.0359654942835247e-07, + "loss": -0.0725, + "reward": -0.014719150494784117, + "reward_after_mean": -0.014719150494784117, + "reward_after_std": 0.6281173154711723, + "reward_before_mean": 0.3637474989518523, + "reward_before_std": 0.5245081130415201, + "reward_change_max": 0.0, + "reward_change_mean": -0.37846665270626545, + "reward_change_min": -0.5382856801152229, + "reward_change_std": 0.20675079058855772, + "reward_std": 0.6281173229217529, + "rewards/accuracy_reward": 0.29166667349636555, + "rewards/cosine_scaled_reward": 0.07208081643329933, + "step": 359 + }, + { + "clip_fraction": 0.0, + "completion_length": 2650.0625610351562, + "epoch": 0.4114285714285714, + "grad_norm": 0.028681648895144463, + "kl": 0.00029480457305908203, + "lambda_div_used": 0.5845082253217697, + "learning_rate": 3.0097380284049523e-07, + "loss": 0.0359, + "reward": 0.034104809165000916, + "reward_after_mean": 0.034104809165000916, + "reward_after_std": 0.5086164381355047, + "reward_before_mean": 0.5177552103996277, + "reward_before_std": 0.3886600947007537, + "reward_change_max": 0.0, + "reward_change_mean": -0.4836503826081753, + "reward_change_min": -0.7048606462776661, + "reward_change_std": 0.27614138927310705, + "reward_std": 0.5086164511740208, + "rewards/accuracy_reward": 0.3958333358168602, + "rewards/cosine_scaled_reward": 0.12192187085747719, + "step": 360 + }, + { + "clip_fraction": 0.0, + "completion_length": 2715.7084197998047, + "epoch": 0.4125714285714286, + "grad_norm": 0.02216268703341484, + "kl": 0.00035099685192108154, + "lambda_div_used": 0.6144327148795128, + "learning_rate": 2.9836319343816397e-07, + "loss": 0.0341, + "reward": 0.11957069113850594, + "reward_after_mean": 0.11957069113850594, + "reward_after_std": 0.5912090875208378, + "reward_before_mean": 0.5671045240014791, + "reward_before_std": 0.523097550496459, + "reward_change_max": 0.0, + "reward_change_mean": -0.4475338254123926, + "reward_change_min": -0.6687514297664165, + "reward_change_std": 0.2680971557274461, + "reward_std": 0.5912090986967087, + "rewards/accuracy_reward": 0.41666667722165585, + "rewards/cosine_scaled_reward": 0.15043783793225884, + "step": 361 + }, + { + "clip_fraction": 0.0, + "completion_length": 1608.958396911621, + "epoch": 0.4137142857142857, + "grad_norm": 0.03410353511571884, + "kl": 0.0002919435501098633, + "lambda_div_used": 0.5573309659957886, + "learning_rate": 2.9576484845877793e-07, + "loss": -0.1085, + "reward": 0.017518717795610428, + "reward_after_mean": 0.017518717795610428, + "reward_after_std": 0.46568065509200096, + "reward_before_mean": 0.5754165817052126, + "reward_before_std": 0.2668048879131675, + "reward_change_max": 0.0, + "reward_change_mean": -0.557897862046957, + "reward_change_min": -0.7572538442909718, + "reward_change_std": 0.2997015379369259, + "reward_std": 0.46568066254258156, + "rewards/accuracy_reward": 0.4375, + "rewards/cosine_scaled_reward": 0.13791657239198685, + "step": 362 + }, + { + "clip_fraction": 0.0, + "completion_length": 1770.7500305175781, + "epoch": 0.41485714285714287, + "grad_norm": 0.036160316318273544, + "kl": 0.0002955198287963867, + "lambda_div_used": 0.5742382705211639, + "learning_rate": 2.931788945420058e-07, + "loss": 0.0235, + "reward": 0.08720480650663376, + "reward_after_mean": 0.08720480650663376, + "reward_after_std": 0.4698806144297123, + "reward_before_mean": 0.624412227421999, + "reward_before_std": 0.33839546935632825, + "reward_change_max": 0.0, + "reward_change_mean": -0.5372074488550425, + "reward_change_min": -0.7525182664394379, + "reward_change_std": 0.2967495834454894, + "reward_std": 0.4698806367814541, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/cosine_scaled_reward": 0.20774555951356888, + "step": 363 + }, + { + "clip_fraction": 0.0, + "completion_length": 2871.395866394043, + "epoch": 0.416, + "grad_norm": 0.021756965667009354, + "kl": 0.0003199577331542969, + "lambda_div_used": 0.6380201950669289, + "learning_rate": 2.9060545772359305e-07, + "loss": 0.0094, + "reward": -0.06151282729115337, + "reward_after_mean": -0.06151282729115337, + "reward_after_std": 0.6731106694787741, + "reward_before_mean": 0.22438967041671276, + "reward_before_std": 0.6390752401202917, + "reward_change_max": 0.0, + "reward_change_mean": -0.2859025076031685, + "reward_change_min": -0.4525425359606743, + "reward_change_std": 0.18198375776410103, + "reward_std": 0.6731106787919998, + "rewards/accuracy_reward": 0.22916667349636555, + "rewards/cosine_scaled_reward": -0.004777010064572096, + "step": 364 + }, + { + "clip_fraction": 0.0, + "completion_length": 2915.7708892822266, + "epoch": 0.41714285714285715, + "grad_norm": 0.026546625420451164, + "kl": 0.00028133392333984375, + "lambda_div_used": 0.6235380545258522, + "learning_rate": 2.8804466342921987e-07, + "loss": 0.0013, + "reward": -0.19956049136817455, + "reward_after_mean": -0.19956049136817455, + "reward_after_std": 0.5996164344251156, + "reward_before_mean": 0.056321932934224606, + "reward_before_std": 0.5726153058931231, + "reward_change_max": 0.0, + "reward_change_mean": -0.2558824270963669, + "reward_change_min": -0.486958272755146, + "reward_change_std": 0.17727997712790966, + "reward_std": 0.5996164586395025, + "rewards/accuracy_reward": 0.16666666977107525, + "rewards/cosine_scaled_reward": -0.11034472845494747, + "step": 365 + }, + { + "clip_fraction": 0.0, + "completion_length": 1711.791690826416, + "epoch": 0.41828571428571426, + "grad_norm": 0.030281659215688705, + "kl": 0.00020581483840942383, + "lambda_div_used": 0.5937831178307533, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0313, + "reward": 0.19904952123761177, + "reward_after_mean": 0.19904952123761177, + "reward_after_std": 0.5501149389892817, + "reward_before_mean": 0.7462510112673044, + "reward_before_std": 0.428714738227427, + "reward_change_max": 0.0, + "reward_change_mean": -0.5472014844417572, + "reward_change_min": -0.7698434814810753, + "reward_change_std": 0.31167063396424055, + "reward_std": 0.550114942714572, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/cosine_scaled_reward": 0.2254176577553153, + "step": 366 + }, + { + "clip_fraction": 0.0, + "completion_length": 2489.0209045410156, + "epoch": 0.41942857142857143, + "grad_norm": 0.02355087362229824, + "kl": 0.00023761391639709473, + "lambda_div_used": 0.6294166967272758, + "learning_rate": 2.829615010283344e-07, + "loss": -0.0082, + "reward": 0.1264641396701336, + "reward_after_mean": 0.1264641396701336, + "reward_after_std": 0.6499300934374332, + "reward_before_mean": 0.5483083166182041, + "reward_before_std": 0.6043886244297028, + "reward_change_max": 0.0, + "reward_change_mean": -0.4218441918492317, + "reward_change_min": -0.6975029557943344, + "reward_change_std": 0.2718219608068466, + "reward_std": 0.6499301269650459, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/cosine_scaled_reward": 0.15247498638927937, + "step": 367 + }, + { + "clip_fraction": 0.0, + "completion_length": 3003.458335876465, + "epoch": 0.4205714285714286, + "grad_norm": 0.030648062005639076, + "kl": 0.00033867359161376953, + "lambda_div_used": 0.5932426005601883, + "learning_rate": 2.8043938066798645e-07, + "loss": 0.0339, + "reward": -0.11047623306512833, + "reward_after_mean": -0.11047623306512833, + "reward_after_std": 0.46016608364880085, + "reward_before_mean": 0.24663935555145144, + "reward_before_std": 0.42351202201098204, + "reward_change_max": 0.0, + "reward_change_mean": -0.3571155872195959, + "reward_change_min": -0.523664090782404, + "reward_change_std": 0.21024074219167233, + "reward_std": 0.460166085511446, + "rewards/accuracy_reward": 0.22916667722165585, + "rewards/cosine_scaled_reward": 0.01747269369661808, + "step": 368 + }, + { + "clip_fraction": 0.0, + "completion_length": 2393.520854949951, + "epoch": 0.4217142857142857, + "grad_norm": 0.03554327413439751, + "kl": 0.00029546022415161133, + "lambda_div_used": 0.6290072500705719, + "learning_rate": 2.7793039831193133e-07, + "loss": -0.1119, + "reward": 0.045321037992835045, + "reward_after_mean": 0.045321037992835045, + "reward_after_std": 0.6632880251854658, + "reward_before_mean": 0.42809890396893024, + "reward_before_std": 0.6034117415547371, + "reward_change_max": 0.0, + "reward_change_mean": -0.3827778585255146, + "reward_change_min": -0.6426752880215645, + "reward_change_std": 0.24680283293128014, + "reward_std": 0.6632880419492722, + "rewards/accuracy_reward": 0.31250000186264515, + "rewards/cosine_scaled_reward": 0.11559889325872064, + "step": 369 + }, + { + "clip_fraction": 0.0, + "completion_length": 3084.6458587646484, + "epoch": 0.4228571428571429, + "grad_norm": 0.021294524893164635, + "kl": 0.00037282705307006836, + "lambda_div_used": 0.6392548009753227, + "learning_rate": 2.7543467624442956e-07, + "loss": 0.0248, + "reward": 0.1099370252341032, + "reward_after_mean": 0.1099370252341032, + "reward_after_std": 0.7434613928198814, + "reward_before_mean": 0.5280295421835035, + "reward_before_std": 0.6512415455654263, + "reward_change_max": 0.0, + "reward_change_mean": -0.4180925004184246, + "reward_change_min": -0.6554959528148174, + "reward_change_std": 0.2566772401332855, + "reward_std": 0.7434614114463329, + "rewards/accuracy_reward": 0.3750000074505806, + "rewards/cosine_scaled_reward": 0.15302953217178583, + "step": 370 + }, + { + "clip_fraction": 0.0, + "completion_length": 1583.729190826416, + "epoch": 0.424, + "grad_norm": 0.03987959772348404, + "kl": 0.000273287296295166, + "lambda_div_used": 0.5595665127038956, + "learning_rate": 2.729523361034538e-07, + "loss": 0.015, + "reward": -0.05838925391435623, + "reward_after_mean": -0.05838925391435623, + "reward_after_std": 0.44396297819912434, + "reward_before_mean": 0.4467340558767319, + "reward_before_std": 0.2712427484802902, + "reward_change_max": 0.0, + "reward_change_mean": -0.5051233097910881, + "reward_change_min": -0.7045671716332436, + "reward_change_std": 0.27023117896169424, + "reward_std": 0.44396298564970493, + "rewards/accuracy_reward": 0.4166666679084301, + "rewards/cosine_scaled_reward": 0.030067380517721176, + "step": 371 + }, + { + "clip_fraction": 0.0, + "completion_length": 2887.87504196167, + "epoch": 0.42514285714285716, + "grad_norm": 0.024510102346539497, + "kl": 0.0002981424331665039, + "lambda_div_used": 0.6707161664962769, + "learning_rate": 2.7048349887476037e-07, + "loss": -0.0009, + "reward": 0.25641736947000027, + "reward_after_mean": 0.25641736947000027, + "reward_after_std": 0.7552758120000362, + "reward_before_mean": 0.6394204869866371, + "reward_before_std": 0.800614426843822, + "reward_change_max": 0.0, + "reward_change_mean": -0.3830030895769596, + "reward_change_min": -0.689335536211729, + "reward_change_std": 0.28328478895127773, + "reward_std": 0.7552758287638426, + "rewards/accuracy_reward": 0.4583333469927311, + "rewards/cosine_scaled_reward": 0.18108712136745453, + "step": 372 + }, + { + "clip_fraction": 0.0, + "completion_length": 1918.5833930969238, + "epoch": 0.42628571428571427, + "grad_norm": 0.03153732046484947, + "kl": 0.00024700164794921875, + "lambda_div_used": 0.6141555905342102, + "learning_rate": 2.6802828488599294e-07, + "loss": 0.0114, + "reward": 0.040444918908178806, + "reward_after_mean": 0.040444918908178806, + "reward_after_std": 0.6271691359579563, + "reward_before_mean": 0.4637982491403818, + "reward_before_std": 0.5247453823685646, + "reward_change_max": 0.0, + "reward_change_mean": -0.42335335724055767, + "reward_change_min": -0.6717953830957413, + "reward_change_std": 0.2525172745808959, + "reward_std": 0.6271691434085369, + "rewards/accuracy_reward": 0.3541666753590107, + "rewards/cosine_scaled_reward": 0.10963157773949206, + "step": 373 + }, + { + "clip_fraction": 0.0, + "completion_length": 1947.4792213439941, + "epoch": 0.42742857142857144, + "grad_norm": 0.036099888384342194, + "kl": 0.0002910494804382324, + "lambda_div_used": 0.6011270731687546, + "learning_rate": 2.655868138008171e-07, + "loss": -0.1104, + "reward": 0.05605058930814266, + "reward_after_mean": 0.05605058930814266, + "reward_after_std": 0.614537576213479, + "reward_before_mean": 0.5277584344148636, + "reward_before_std": 0.46880532428622246, + "reward_change_max": 0.0, + "reward_change_mean": -0.4717078376561403, + "reward_change_min": -0.7001210488379002, + "reward_change_std": 0.270130792632699, + "reward_std": 0.6145375911146402, + "rewards/accuracy_reward": 0.37500000186264515, + "rewards/cosine_scaled_reward": 0.1527584195137024, + "step": 374 + }, + { + "clip_fraction": 0.0, + "completion_length": 2643.3333892822266, + "epoch": 0.42857142857142855, + "grad_norm": 0.025029828771948814, + "kl": 0.0003476142883300781, + "lambda_div_used": 0.558178536593914, + "learning_rate": 2.631592046130896e-07, + "loss": -0.0046, + "reward": 0.06168156489729881, + "reward_after_mean": 0.06168156489729881, + "reward_after_std": 0.46577400900423527, + "reward_before_mean": 0.6490027587860823, + "reward_before_std": 0.2644077790901065, + "reward_change_max": 0.0, + "reward_change_mean": -0.5873212069272995, + "reward_change_min": -0.8054014258086681, + "reward_change_std": 0.3127285521477461, + "reward_std": 0.46577401272952557, + "rewards/accuracy_reward": 0.4375, + "rewards/cosine_scaled_reward": 0.2115027718245983, + "step": 375 + }, + { + "clip_fraction": 0.0, + "completion_length": 2176.875045776367, + "epoch": 0.4297142857142857, + "grad_norm": 0.025618579238653183, + "kl": 0.00024950504302978516, + "lambda_div_used": 0.5522258281707764, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0245, + "reward": -0.24329839646816254, + "reward_after_mean": -0.24329839646816254, + "reward_after_std": 0.3648342005908489, + "reward_before_mean": 0.15543348528444767, + "reward_before_std": 0.2328620203770697, + "reward_change_max": 0.0, + "reward_change_mean": -0.3987318556755781, + "reward_change_min": -0.5627252347767353, + "reward_change_std": 0.21180008072406054, + "reward_std": 0.3648342117667198, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/cosine_scaled_reward": -0.052899875794537365, + "step": 376 + }, + { + "clip_fraction": 0.0, + "completion_length": 3302.1875610351562, + "epoch": 0.4308571428571429, + "grad_norm": 0.019104059785604477, + "kl": 0.0004137754440307617, + "lambda_div_used": 0.6249697953462601, + "learning_rate": 2.583460445215911e-07, + "loss": 0.0065, + "reward": -0.03637286019511521, + "reward_after_mean": -0.03637286019511521, + "reward_after_std": 0.6691129393875599, + "reward_before_mean": 0.30246374011039734, + "reward_before_std": 0.578639387153089, + "reward_change_max": 0.0, + "reward_change_mean": -0.3388365972787142, + "reward_change_min": -0.5369448103010654, + "reward_change_std": 0.20115663390606642, + "reward_std": 0.6691129766404629, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/cosine_scaled_reward": 0.010797052644193172, + "step": 377 + }, + { + "clip_fraction": 0.0, + "completion_length": 1876.0208568572998, + "epoch": 0.432, + "grad_norm": 0.039962347596883774, + "kl": 0.0002534538507461548, + "lambda_div_used": 0.6230520308017731, + "learning_rate": 2.5596072820445254e-07, + "loss": 0.0138, + "reward": 0.20900094881653786, + "reward_after_mean": 0.20900094881653786, + "reward_after_std": 0.685304744169116, + "reward_before_mean": 0.7048993427306414, + "reward_before_std": 0.577358863549307, + "reward_change_max": 0.0, + "reward_change_mean": -0.4958983939141035, + "reward_change_min": -0.7410164549946785, + "reward_change_std": 0.3000446343794465, + "reward_std": 0.6853047590702772, + "rewards/accuracy_reward": 0.5000000055879354, + "rewards/cosine_scaled_reward": 0.204899336444214, + "step": 378 + }, + { + "clip_fraction": 0.0, + "completion_length": 2977.7917289733887, + "epoch": 0.43314285714285716, + "grad_norm": 0.023082684725522995, + "kl": 0.00034427642822265625, + "lambda_div_used": 0.564103439450264, + "learning_rate": 2.5358974294659373e-07, + "loss": 0.0472, + "reward": -0.2481522224843502, + "reward_after_mean": -0.2481522224843502, + "reward_after_std": 0.3903357107192278, + "reward_before_mean": 0.1231729257851839, + "reward_before_std": 0.2936667911708355, + "reward_change_max": 0.0, + "reward_change_mean": -0.37132514640688896, + "reward_change_min": -0.5592780411243439, + "reward_change_std": 0.21076095290482044, + "reward_std": 0.39033573493361473, + "rewards/accuracy_reward": 0.1875, + "rewards/cosine_scaled_reward": -0.06432707794010639, + "step": 379 + }, + { + "clip_fraction": 0.0, + "completion_length": 2197.2708740234375, + "epoch": 0.4342857142857143, + "grad_norm": 0.0468011274933815, + "kl": 0.0003237128257751465, + "lambda_div_used": 0.6098255217075348, + "learning_rate": 2.512332043064913e-07, + "loss": -0.1385, + "reward": -0.01926261931657791, + "reward_after_mean": -0.01926261931657791, + "reward_after_std": 0.582806745544076, + "reward_before_mean": 0.3642494883388281, + "reward_before_std": 0.5125604961067438, + "reward_change_max": 0.0, + "reward_change_mean": -0.38351211696863174, + "reward_change_min": -0.6093635484576225, + "reward_change_std": 0.24249585159122944, + "reward_std": 0.5828067641705275, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/cosine_scaled_reward": 0.030916159972548485, + "step": 380 + }, + { + "clip_fraction": 0.0, + "completion_length": 2804.291702270508, + "epoch": 0.43542857142857144, + "grad_norm": 0.03009038046002388, + "kl": 0.00038611888885498047, + "lambda_div_used": 0.6074136793613434, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0773, + "reward": -0.2030959241092205, + "reward_after_mean": -0.2030959241092205, + "reward_after_std": 0.5294999033212662, + "reward_before_mean": 0.0848972403910011, + "reward_before_std": 0.4909538859501481, + "reward_change_max": 0.0, + "reward_change_mean": -0.287993174046278, + "reward_change_min": -0.4920281432569027, + "reward_change_std": 0.1843261569738388, + "reward_std": 0.5294999219477177, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/cosine_scaled_reward": -0.08176943706348538, + "step": 381 + }, + { + "clip_fraction": 0.0, + "completion_length": 1741.7292022705078, + "epoch": 0.43657142857142855, + "grad_norm": 0.02836902253329754, + "kl": 0.000269085168838501, + "lambda_div_used": 0.593373216688633, + "learning_rate": 2.465639255873246e-07, + "loss": 0.0017, + "reward": -0.2921748459339142, + "reward_after_mean": -0.2921748459339142, + "reward_after_std": 0.4851537048816681, + "reward_before_mean": -0.02789500029757619, + "reward_before_std": 0.4276847830042243, + "reward_change_max": 0.0, + "reward_change_mean": -0.2642798572778702, + "reward_change_min": -0.4220714569091797, + "reward_change_std": 0.156461289152503, + "reward_std": 0.485153716057539, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/cosine_scaled_reward": -0.152894988656044, + "step": 382 + }, + { + "clip_fraction": 0.0, + "completion_length": 2540.208366394043, + "epoch": 0.4377142857142857, + "grad_norm": 0.027282925322651863, + "kl": 0.00039958953857421875, + "lambda_div_used": 0.6196694001555443, + "learning_rate": 2.4425141308231765e-07, + "loss": -0.0249, + "reward": -0.009514571633189917, + "reward_after_mean": -0.009514571633189917, + "reward_after_std": 0.6358621753752232, + "reward_before_mean": 0.3640937558375299, + "reward_before_std": 0.5521808844059706, + "reward_change_max": 0.0, + "reward_change_mean": -0.3736083246767521, + "reward_change_min": -0.5938910432159901, + "reward_change_std": 0.22305074147880077, + "reward_std": 0.6358622014522552, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/cosine_scaled_reward": 0.07242710120044649, + "step": 383 + }, + { + "clip_fraction": 0.0, + "completion_length": 2133.9166984558105, + "epoch": 0.43885714285714283, + "grad_norm": 0.031124358996748924, + "kl": 0.00030043721199035645, + "lambda_div_used": 0.6193316578865051, + "learning_rate": 2.4195380233209006e-07, + "loss": -0.0227, + "reward": 0.32511539570987225, + "reward_after_mean": 0.32511539570987225, + "reward_after_std": 0.7301186248660088, + "reward_before_mean": 0.913657930213958, + "reward_before_std": 0.5516102942638099, + "reward_change_max": 0.0, + "reward_change_mean": -0.588542552664876, + "reward_change_min": -0.8305801004171371, + "reward_change_std": 0.32795302756130695, + "reward_std": 0.7301186472177505, + "rewards/accuracy_reward": 0.5833333395421505, + "rewards/cosine_scaled_reward": 0.3303246097639203, + "step": 384 + }, + { + "clip_fraction": 0.0, + "completion_length": 2633.770851135254, + "epoch": 0.44, + "grad_norm": 0.018747175112366676, + "kl": 0.00027829408645629883, + "lambda_div_used": 0.59091367572546, + "learning_rate": 2.3967120531894857e-07, + "loss": 0.0052, + "reward": -0.16111253947019577, + "reward_after_mean": -0.16111253947019577, + "reward_after_std": 0.4700228702276945, + "reward_before_mean": 0.17826138995587826, + "reward_before_std": 0.41598498076200485, + "reward_change_max": 0.0, + "reward_change_mean": -0.3393739238381386, + "reward_change_min": -0.5320228524506092, + "reward_change_std": 0.20085815154016018, + "reward_std": 0.4700228702276945, + "rewards/accuracy_reward": 0.22916666977107525, + "rewards/cosine_scaled_reward": -0.05090527608990669, + "step": 385 + }, + { + "clip_fraction": 0.0, + "completion_length": 2641.6250762939453, + "epoch": 0.44114285714285717, + "grad_norm": 0.022516794502735138, + "kl": 0.00033086538314819336, + "lambda_div_used": 0.5835114791989326, + "learning_rate": 2.374037332934512e-07, + "loss": -0.0898, + "reward": -0.16643539629876614, + "reward_after_mean": -0.16643539629876614, + "reward_after_std": 0.47341430373489857, + "reward_before_mean": 0.20273982174694538, + "reward_before_std": 0.38260515965521336, + "reward_change_max": 0.0, + "reward_change_mean": -0.36917522735893726, + "reward_change_min": -0.5404521636664867, + "reward_change_std": 0.20916004106402397, + "reward_std": 0.4734143167734146, + "rewards/accuracy_reward": 0.22916666977107525, + "rewards/cosine_scaled_reward": -0.02642684616148472, + "step": 386 + }, + { + "clip_fraction": 0.0, + "completion_length": 2953.6875610351562, + "epoch": 0.4422857142857143, + "grad_norm": 0.023601215332746506, + "kl": 0.0003770887851715088, + "lambda_div_used": 0.6255086436867714, + "learning_rate": 2.3515149676898552e-07, + "loss": 0.013, + "reward": -0.055427778512239456, + "reward_after_mean": -0.055427778512239456, + "reward_after_std": 0.6565756388008595, + "reward_before_mean": 0.2887960313819349, + "reward_before_std": 0.582161720842123, + "reward_change_max": 0.0, + "reward_change_mean": -0.3442238178104162, + "reward_change_min": -0.555218169465661, + "reward_change_std": 0.2120614117011428, + "reward_std": 0.6565756406635046, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/cosine_scaled_reward": -0.0028706385055556893, + "step": 387 + }, + { + "clip_fraction": 0.0, + "completion_length": 2360.6667251586914, + "epoch": 0.44342857142857145, + "grad_norm": 0.026174332946538925, + "kl": 0.00029283761978149414, + "lambda_div_used": 0.5821729674935341, + "learning_rate": 2.3291460551638237e-07, + "loss": -0.0288, + "reward": -0.10938419215381145, + "reward_after_mean": -0.10938419215381145, + "reward_after_std": 0.4824158512055874, + "reward_before_mean": 0.2964679952710867, + "reward_before_std": 0.37308686412870884, + "reward_change_max": 0.0, + "reward_change_mean": -0.405852185562253, + "reward_change_min": -0.5691216923296452, + "reward_change_std": 0.2231999458745122, + "reward_std": 0.4824158661067486, + "rewards/accuracy_reward": 0.2708333395421505, + "rewards/cosine_scaled_reward": 0.025634657591581345, + "step": 388 + }, + { + "clip_fraction": 0.0, + "completion_length": 2439.229232788086, + "epoch": 0.44457142857142856, + "grad_norm": 0.023821894079446793, + "kl": 0.00028192996978759766, + "lambda_div_used": 0.6302470341324806, + "learning_rate": 2.306931685585657e-07, + "loss": 0.0324, + "reward": 0.04070591554045677, + "reward_after_mean": 0.04070591554045677, + "reward_after_std": 0.6089895591139793, + "reward_before_mean": 0.3929547220468521, + "reward_before_std": 0.6121768653392792, + "reward_change_max": 0.0, + "reward_change_mean": -0.35224880650639534, + "reward_change_min": -0.6027859784662724, + "reward_change_std": 0.2432803064584732, + "reward_std": 0.6089895665645599, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/cosine_scaled_reward": 0.10128805413842201, + "step": 389 + }, + { + "clip_fraction": 0.0, + "completion_length": 2221.3750610351562, + "epoch": 0.44571428571428573, + "grad_norm": 0.026545513421297073, + "kl": 0.0002053976058959961, + "lambda_div_used": 0.5998165532946587, + "learning_rate": 2.2848729416523859e-07, + "loss": 0.0542, + "reward": 0.11247721314430237, + "reward_after_mean": 0.11247721314430237, + "reward_after_std": 0.5243697017431259, + "reward_before_mean": 0.5878691142424941, + "reward_before_std": 0.4597471170127392, + "reward_change_max": 0.0, + "reward_change_mean": -0.4753919020295143, + "reward_change_min": -0.7015566639602184, + "reward_change_std": 0.28336913883686066, + "reward_std": 0.5243697185069323, + "rewards/accuracy_reward": 0.3750000074505806, + "rewards/cosine_scaled_reward": 0.2128690993413329, + "step": 390 + }, + { + "clip_fraction": 0.0, + "completion_length": 2397.791736602783, + "epoch": 0.44685714285714284, + "grad_norm": 0.030633823946118355, + "kl": 0.0002848505973815918, + "lambda_div_used": 0.6453134343028069, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.0079, + "reward": 0.32186132250353694, + "reward_after_mean": 0.32186132250353694, + "reward_after_std": 0.7361500542610884, + "reward_before_mean": 0.8155574453994632, + "reward_before_std": 0.6687694359570742, + "reward_change_max": 0.0, + "reward_change_mean": -0.4936961196362972, + "reward_change_min": -0.7406940795481205, + "reward_change_std": 0.2951981630176306, + "reward_std": 0.7361500766128302, + "rewards/accuracy_reward": 0.5208333507180214, + "rewards/cosine_scaled_reward": 0.2947241172660142, + "step": 391 + }, + { + "clip_fraction": 0.0, + "completion_length": 1701.3541946411133, + "epoch": 0.448, + "grad_norm": 0.030896564945578575, + "kl": 0.00023484230041503906, + "lambda_div_used": 0.5580313578248024, + "learning_rate": 2.2412266235313973e-07, + "loss": 0.0276, + "reward": -0.1474175527691841, + "reward_after_mean": -0.1474175527691841, + "reward_after_std": 0.4200621973723173, + "reward_before_mean": 0.3056653430685401, + "reward_before_std": 0.2651460962370038, + "reward_change_max": 0.0, + "reward_change_mean": -0.45308290608227253, + "reward_change_min": -0.6591883115470409, + "reward_change_std": 0.24698374886065722, + "reward_std": 0.42006222531199455, + "rewards/accuracy_reward": 0.2916666679084301, + "rewards/cosine_scaled_reward": 0.013998678419739008, + "step": 392 + }, + { + "clip_fraction": 0.0, + "completion_length": 2144.7708892822266, + "epoch": 0.4491428571428571, + "grad_norm": 0.025813451036810875, + "kl": 0.00031191110610961914, + "lambda_div_used": 0.6718219220638275, + "learning_rate": 2.2196411766036487e-07, + "loss": -0.0357, + "reward": 0.048092352226376534, + "reward_after_mean": 0.048092352226376534, + "reward_after_std": 0.7942012958228588, + "reward_before_mean": 0.32687312876805663, + "reward_before_std": 0.8060374613851309, + "reward_change_max": 0.0, + "reward_change_mean": -0.27878078632056713, + "reward_change_min": -0.5697837248444557, + "reward_change_std": 0.22098596021533012, + "reward_std": 0.7942013349384069, + "rewards/accuracy_reward": 0.31250000558793545, + "rewards/cosine_scaled_reward": 0.014373119222000241, + "step": 393 + }, + { + "clip_fraction": 0.0, + "completion_length": 3021.5833740234375, + "epoch": 0.4502857142857143, + "grad_norm": 0.02360186167061329, + "kl": 0.00038444995880126953, + "lambda_div_used": 0.6319213733077049, + "learning_rate": 2.1982156097370557e-07, + "loss": 0.052, + "reward": -0.20860249735414982, + "reward_after_mean": -0.20860249735414982, + "reward_after_std": 0.6297264527529478, + "reward_before_mean": 0.03818079084157944, + "reward_before_std": 0.6159613355994225, + "reward_change_max": 0.0, + "reward_change_mean": -0.2467832900583744, + "reward_change_min": -0.5391863323748112, + "reward_change_std": 0.1899872226640582, + "reward_std": 0.6297264751046896, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/cosine_scaled_reward": -0.12848588544875383, + "step": 394 + }, + { + "clip_fraction": 0.0, + "completion_length": 2135.8333473205566, + "epoch": 0.4514285714285714, + "grad_norm": 0.04946435987949371, + "kl": 0.00024586915969848633, + "lambda_div_used": 0.576830618083477, + "learning_rate": 2.1769509671835223e-07, + "loss": -0.0301, + "reward": -0.24670689832419157, + "reward_after_mean": -0.24670689832419157, + "reward_after_std": 0.4664156064391136, + "reward_before_mean": 0.09556343220174313, + "reward_before_std": 0.35311094112694263, + "reward_change_max": 0.0, + "reward_change_mean": -0.3422703631222248, + "reward_change_min": -0.5164104513823986, + "reward_change_std": 0.19275081250816584, + "reward_std": 0.466415636241436, + "rewards/accuracy_reward": 0.18750000186264515, + "rewards/cosine_scaled_reward": -0.09193656174466014, + "step": 395 + }, + { + "clip_fraction": 0.0, + "completion_length": 2614.750068664551, + "epoch": 0.45257142857142857, + "grad_norm": 0.021299051120877266, + "kl": 0.00028631091117858887, + "lambda_div_used": 0.6214602738618851, + "learning_rate": 2.1558482853517253e-07, + "loss": 0.0454, + "reward": 0.09535084664821625, + "reward_after_mean": 0.09535084664821625, + "reward_after_std": 0.6240330375730991, + "reward_before_mean": 0.5079526733607054, + "reward_before_std": 0.5631251083686948, + "reward_change_max": 0.0, + "reward_change_mean": -0.41260186582803726, + "reward_change_min": -0.6420417241752148, + "reward_change_std": 0.2541531687602401, + "reward_std": 0.6240330524742603, + "rewards/accuracy_reward": 0.37500000558793545, + "rewards/cosine_scaled_reward": 0.13295269757509232, + "step": 396 + }, + { + "clip_fraction": 0.0, + "completion_length": 2262.5625610351562, + "epoch": 0.45371428571428574, + "grad_norm": 0.023174487054347992, + "kl": 0.00028808414936065674, + "lambda_div_used": 0.620752289891243, + "learning_rate": 2.134908592756607e-07, + "loss": -0.0334, + "reward": 0.0681952117010951, + "reward_after_mean": 0.0681952117010951, + "reward_after_std": 0.6165110263973475, + "reward_before_mean": 0.48235524632036686, + "reward_before_std": 0.5585681181401014, + "reward_change_max": 0.0, + "reward_change_mean": -0.41416002810001373, + "reward_change_min": -0.6492009982466698, + "reward_change_std": 0.2585160303860903, + "reward_std": 0.6165110506117344, + "rewards/accuracy_reward": 0.35416667349636555, + "rewards/cosine_scaled_reward": 0.1281885566713754, + "step": 397 + }, + { + "clip_fraction": 0.0, + "completion_length": 2219.1875534057617, + "epoch": 0.45485714285714285, + "grad_norm": 0.02591089904308319, + "kl": 0.0003045201301574707, + "lambda_div_used": 0.6068568229675293, + "learning_rate": 2.1141329099692406e-07, + "loss": 0.0302, + "reward": -0.10764243453741074, + "reward_after_mean": -0.10764243453741074, + "reward_after_std": 0.5858584549278021, + "reward_before_mean": 0.2538683768361807, + "reward_before_std": 0.4971063416451216, + "reward_change_max": 0.0, + "reward_change_mean": -0.36151083186268806, + "reward_change_min": -0.6307843886315823, + "reward_change_std": 0.2273276075720787, + "reward_std": 0.5858584903180599, + "rewards/accuracy_reward": 0.2500000037252903, + "rewards/cosine_scaled_reward": 0.003868376836180687, + "step": 398 + }, + { + "clip_fraction": 0.0, + "completion_length": 2028.7292175292969, + "epoch": 0.456, + "grad_norm": 0.023573419079184532, + "kl": 0.00022789835929870605, + "lambda_div_used": 0.6212376356124878, + "learning_rate": 2.0935222495670968e-07, + "loss": 0.0692, + "reward": 0.19389131292700768, + "reward_after_mean": 0.19389131292700768, + "reward_after_std": 0.6119374781847, + "reward_before_mean": 0.656364331021905, + "reward_before_std": 0.5594985205680132, + "reward_change_max": 0.0, + "reward_change_mean": -0.4624730013310909, + "reward_change_min": -0.6958450116217136, + "reward_change_std": 0.28642346803098917, + "reward_std": 0.6119375005364418, + "rewards/accuracy_reward": 0.479166679084301, + "rewards/cosine_scaled_reward": 0.1771976239979267, + "step": 399 + }, + { + "clip_fraction": 0.0, + "completion_length": 1497.520881652832, + "epoch": 0.45714285714285713, + "grad_norm": 0.034730274230241776, + "kl": 0.00023819506168365479, + "lambda_div_used": 0.6499348282814026, + "learning_rate": 2.0730776160846853e-07, + "loss": -0.0384, + "reward": 0.35817267652601004, + "reward_after_mean": 0.35817267652601004, + "reward_after_std": 0.7002598587423563, + "reward_before_mean": 0.8351266942918301, + "reward_before_std": 0.6965709868818521, + "reward_change_max": 0.0, + "reward_change_mean": -0.47695402428507805, + "reward_change_min": -0.7681624032557011, + "reward_change_std": 0.3101581484079361, + "reward_std": 0.7002598755061626, + "rewards/accuracy_reward": 0.5208333469927311, + "rewards/cosine_scaled_reward": 0.31429335149005055, + "step": 400 + }, + { + "clip_fraction": 0.0, + "completion_length": 2817.020851135254, + "epoch": 0.4582857142857143, + "grad_norm": 0.024576053023338318, + "kl": 0.00034928321838378906, + "lambda_div_used": 0.5570264235138893, + "learning_rate": 2.0528000059645995e-07, + "loss": 0.0255, + "reward": -0.17014812678098679, + "reward_after_mean": -0.17014812678098679, + "reward_after_std": 0.3594451379030943, + "reward_before_mean": 0.25060533825308084, + "reward_before_std": 0.25669852178543806, + "reward_change_max": 0.0, + "reward_change_mean": -0.4207534771412611, + "reward_change_min": -0.5965141579508781, + "reward_change_std": 0.2299406472593546, + "reward_std": 0.35944515466690063, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/cosine_scaled_reward": 0.02143866289407015, + "step": 401 + }, + { + "clip_fraction": 0.0, + "completion_length": 2323.062530517578, + "epoch": 0.4594285714285714, + "grad_norm": 0.02968554012477398, + "kl": 0.00033861398696899414, + "lambda_div_used": 0.579738162457943, + "learning_rate": 2.032690407508949e-07, + "loss": -0.0104, + "reward": -0.20102215744554996, + "reward_after_mean": -0.20102215744554996, + "reward_after_std": 0.4696238599717617, + "reward_before_mean": 0.16348140873014927, + "reward_before_std": 0.36563692055642605, + "reward_change_max": 0.0, + "reward_change_mean": -0.36450355127453804, + "reward_change_min": -0.5928855016827583, + "reward_change_std": 0.21332142874598503, + "reward_std": 0.46962387673556805, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/cosine_scaled_reward": -0.044851938262581825, + "step": 402 + }, + { + "clip_fraction": 0.0, + "completion_length": 1727.2708587646484, + "epoch": 0.4605714285714286, + "grad_norm": 0.03782231733202934, + "kl": 0.0002620220184326172, + "lambda_div_used": 0.5557271614670753, + "learning_rate": 2.0127498008311922e-07, + "loss": 0.0459, + "reward": -0.10220484808087349, + "reward_after_mean": -0.10220484808087349, + "reward_after_std": 0.4127990063279867, + "reward_before_mean": 0.37698337249457836, + "reward_before_std": 0.25216651428490877, + "reward_change_max": 0.0, + "reward_change_mean": -0.4791882447898388, + "reward_change_min": -0.6764676049351692, + "reward_change_std": 0.25756980665028095, + "reward_std": 0.4127990175038576, + "rewards/accuracy_reward": 0.3125, + "rewards/cosine_scaled_reward": 0.06448337621986866, + "step": 403 + }, + { + "clip_fraction": 0.0, + "completion_length": 2469.5000076293945, + "epoch": 0.4617142857142857, + "grad_norm": 0.0334598608314991, + "kl": 0.00029639899730682373, + "lambda_div_used": 0.5738808363676071, + "learning_rate": 1.9929791578083655e-07, + "loss": 0.0173, + "reward": 0.004905553534626961, + "reward_after_mean": 0.004905553534626961, + "reward_after_std": 0.48677791468799114, + "reward_before_mean": 0.506166247650981, + "reward_before_std": 0.33276718482375145, + "reward_change_max": 0.0, + "reward_change_mean": -0.5012606829404831, + "reward_change_min": -0.6952128820121288, + "reward_change_std": 0.26712857093662024, + "reward_std": 0.4867779165506363, + "rewards/accuracy_reward": 0.37500000558793545, + "rewards/cosine_scaled_reward": 0.1311662346124649, + "step": 404 + }, + { + "clip_fraction": 0.0, + "completion_length": 1970.9167022705078, + "epoch": 0.46285714285714286, + "grad_norm": 0.03668729215860367, + "kl": 0.00032258033752441406, + "lambda_div_used": 0.6146402955055237, + "learning_rate": 1.9733794420337213e-07, + "loss": 0.0422, + "reward": 0.1430281363427639, + "reward_after_mean": 0.1430281363427639, + "reward_after_std": 0.5927682984620333, + "reward_before_mean": 0.610472509637475, + "reward_before_std": 0.530167305842042, + "reward_change_max": 0.0, + "reward_change_mean": -0.4674443490803242, + "reward_change_min": -0.726629700511694, + "reward_change_std": 0.28909510001540184, + "reward_std": 0.592768307775259, + "rewards/accuracy_reward": 0.4375000111758709, + "rewards/cosine_scaled_reward": 0.17297248914837837, + "step": 405 + }, + { + "clip_fraction": 0.0, + "completion_length": 2001.6875228881836, + "epoch": 0.464, + "grad_norm": 0.022773489356040955, + "kl": 0.0002652406692504883, + "lambda_div_used": 0.6149442344903946, + "learning_rate": 1.9539516087697517e-07, + "loss": -0.01, + "reward": 0.13723283261060715, + "reward_after_mean": 0.13723283261060715, + "reward_after_std": 0.6057328097522259, + "reward_before_mean": 0.5846194308251143, + "reward_before_std": 0.5296543845906854, + "reward_change_max": 0.0, + "reward_change_mean": -0.44738658517599106, + "reward_change_min": -0.6640791147947311, + "reward_change_std": 0.267610440030694, + "reward_std": 0.6057328246533871, + "rewards/accuracy_reward": 0.41666667722165585, + "rewards/cosine_scaled_reward": 0.16795274708420038, + "step": 406 + }, + { + "clip_fraction": 0.0, + "completion_length": 2183.3333892822266, + "epoch": 0.46514285714285714, + "grad_norm": 0.029050234705209732, + "kl": 0.00023674964904785156, + "lambda_div_used": 0.5692102611064911, + "learning_rate": 1.934696604901642e-07, + "loss": -0.002, + "reward": 0.08383433520793915, + "reward_after_mean": 0.08383433520793915, + "reward_after_std": 0.5238823061808944, + "reward_before_mean": 0.6431381715228781, + "reward_before_std": 0.31836726085748523, + "reward_change_max": 0.0, + "reward_change_mean": -0.5593038275837898, + "reward_change_min": -0.7351665589958429, + "reward_change_std": 0.29199546575546265, + "reward_std": 0.5238823387771845, + "rewards/accuracy_reward": 0.47916666977107525, + "rewards/cosine_scaled_reward": 0.16397148557007313, + "step": 407 + }, + { + "clip_fraction": 0.0, + "completion_length": 2340.208396911621, + "epoch": 0.4662857142857143, + "grad_norm": 0.0308608990162611, + "kl": 0.0002751350402832031, + "lambda_div_used": 0.595428429543972, + "learning_rate": 1.915615368891117e-07, + "loss": -0.0448, + "reward": -0.14165206719189882, + "reward_after_mean": -0.14165206719189882, + "reward_after_std": 0.5383005198091269, + "reward_before_mean": 0.22720495285466313, + "reward_before_std": 0.4390671527944505, + "reward_change_max": 0.0, + "reward_change_mean": -0.3688570037484169, + "reward_change_min": -0.560546163469553, + "reward_change_std": 0.21708335354924202, + "reward_std": 0.5383005253970623, + "rewards/accuracy_reward": 0.27083333767950535, + "rewards/cosine_scaled_reward": -0.043628389947116375, + "step": 408 + }, + { + "clip_fraction": 0.0, + "completion_length": 3338.3333740234375, + "epoch": 0.4674285714285714, + "grad_norm": 0.017489202320575714, + "kl": 0.0003743171691894531, + "lambda_div_used": 0.641681618988514, + "learning_rate": 1.8967088307307e-07, + "loss": 0.0221, + "reward": 0.0034925403306260705, + "reward_after_mean": 0.0034925403306260705, + "reward_after_std": 0.7239628247916698, + "reward_before_mean": 0.3585042329505086, + "reward_before_std": 0.6582456473261118, + "reward_change_max": 0.0, + "reward_change_mean": -0.3550117015838623, + "reward_change_min": -0.627179455012083, + "reward_change_std": 0.23101032618433237, + "reward_std": 0.7239628490060568, + "rewards/accuracy_reward": 0.3125000037252903, + "rewards/cosine_scaled_reward": 0.04600422829389572, + "step": 409 + }, + { + "clip_fraction": 0.0, + "completion_length": 2387.291690826416, + "epoch": 0.4685714285714286, + "grad_norm": 0.028829436749219894, + "kl": 0.000278472900390625, + "lambda_div_used": 0.6026698350906372, + "learning_rate": 1.8779779118983867e-07, + "loss": -0.0184, + "reward": -0.08084386587142944, + "reward_after_mean": -0.08084386587142944, + "reward_after_std": 0.5610201843082905, + "reward_before_mean": 0.2896402692422271, + "reward_before_std": 0.4759355755522847, + "reward_change_max": 0.0, + "reward_change_mean": -0.37048413045704365, + "reward_change_min": -0.5802675113081932, + "reward_change_std": 0.2220335192978382, + "reward_std": 0.561020215973258, + "rewards/accuracy_reward": 0.27083333767950535, + "rewards/cosine_scaled_reward": 0.018806922249495983, + "step": 410 + }, + { + "clip_fraction": 0.0, + "completion_length": 2483.9791717529297, + "epoch": 0.4697142857142857, + "grad_norm": 0.02735401690006256, + "kl": 0.00030410289764404297, + "lambda_div_used": 0.5891791060566902, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.0464, + "reward": -0.09278726205229759, + "reward_after_mean": -0.09278726205229759, + "reward_after_std": 0.48578726314008236, + "reward_before_mean": 0.29704072792083025, + "reward_before_std": 0.41402094066143036, + "reward_change_max": 0.0, + "reward_change_mean": -0.3898279666900635, + "reward_change_min": -0.6011387817561626, + "reward_change_std": 0.234495647251606, + "reward_std": 0.48578727059066296, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/cosine_scaled_reward": 0.026207380928099155, + "step": 411 + }, + { + "clip_fraction": 0.0, + "completion_length": 2443.000057220459, + "epoch": 0.47085714285714286, + "grad_norm": 0.027080198749899864, + "kl": 0.0003039836883544922, + "lambda_div_used": 0.626535639166832, + "learning_rate": 1.8410465752883758e-07, + "loss": 0.0382, + "reward": 0.18809181079268456, + "reward_after_mean": 0.18809181079268456, + "reward_after_std": 0.6282138898968697, + "reward_before_mean": 0.6274868324398994, + "reward_before_std": 0.5877205710858107, + "reward_change_max": 0.0, + "reward_change_mean": -0.4393950141966343, + "reward_change_min": -0.6751919612288475, + "reward_change_std": 0.2793376138433814, + "reward_std": 0.6282139029353857, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/cosine_scaled_reward": 0.16915349289774895, + "step": 412 + }, + { + "clip_fraction": 0.0, + "completion_length": 2277.0417251586914, + "epoch": 0.472, + "grad_norm": 0.02998846024274826, + "kl": 0.00023829936981201172, + "lambda_div_used": 0.6537542790174484, + "learning_rate": 1.822847957491922e-07, + "loss": 0.023, + "reward": 0.0677551869302988, + "reward_after_mean": 0.0677551869302988, + "reward_after_std": 0.6939267106354237, + "reward_before_mean": 0.3922595623880625, + "reward_before_std": 0.7187845781445503, + "reward_change_max": 0.0, + "reward_change_mean": -0.3245043680071831, + "reward_change_min": -0.6250169165432453, + "reward_change_std": 0.24318813905119896, + "reward_std": 0.6939267329871655, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/cosine_scaled_reward": 0.05892622594546992, + "step": 413 + }, + { + "clip_fraction": 0.0, + "completion_length": 3080.5625, + "epoch": 0.47314285714285714, + "grad_norm": 0.016316330060362816, + "kl": 0.00028955936431884766, + "lambda_div_used": 0.6257347464561462, + "learning_rate": 1.804828558898332e-07, + "loss": 0.0235, + "reward": -0.13909682049416006, + "reward_after_mean": -0.13909682049416006, + "reward_after_std": 0.6051931101828814, + "reward_before_mean": 0.1403335351496935, + "reward_before_std": 0.583410625346005, + "reward_change_max": 0.0, + "reward_change_mean": -0.2794303596019745, + "reward_change_min": -0.45995376631617546, + "reward_change_std": 0.1819247854873538, + "reward_std": 0.6051931362599134, + "rewards/accuracy_reward": 0.1875000037252903, + "rewards/cosine_scaled_reward": -0.04716646298766136, + "step": 414 + }, + { + "clip_fraction": 0.0, + "completion_length": 3129.0208740234375, + "epoch": 0.4742857142857143, + "grad_norm": 0.020433053374290466, + "kl": 0.0003305673599243164, + "lambda_div_used": 0.6250654757022858, + "learning_rate": 1.7869892577476722e-07, + "loss": 0.0036, + "reward": -0.22648247238248587, + "reward_after_mean": -0.22648247238248587, + "reward_after_std": 0.6230853609740734, + "reward_before_mean": 0.01668240688741207, + "reward_before_std": 0.5801385007798672, + "reward_change_max": 0.0, + "reward_change_mean": -0.24316489323973656, + "reward_change_min": -0.43964531272649765, + "reward_change_std": 0.1610111938789487, + "reward_std": 0.6230853945016861, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/cosine_scaled_reward": -0.12915092520415783, + "step": 415 + }, + { + "clip_fraction": 0.0, + "completion_length": 1350.31254196167, + "epoch": 0.4754285714285714, + "grad_norm": 0.029406633228063583, + "kl": 0.00015437602996826172, + "lambda_div_used": 0.6057035326957703, + "learning_rate": 1.7693309235023127e-07, + "loss": -0.006, + "reward": -0.15187997743487358, + "reward_after_mean": -0.15187997743487358, + "reward_after_std": 0.500312227755785, + "reward_before_mean": 0.1518111266195774, + "reward_before_std": 0.4886645954102278, + "reward_change_max": 0.0, + "reward_change_mean": -0.3036911189556122, + "reward_change_min": -0.5342599004507065, + "reward_change_std": 0.20331810228526592, + "reward_std": 0.5003122296184301, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/cosine_scaled_reward": -0.05652220547199249, + "step": 416 + }, + { + "clip_fraction": 0.0, + "completion_length": 3118.062530517578, + "epoch": 0.4765714285714286, + "grad_norm": 0.022452035918831825, + "kl": 0.00038176774978637695, + "lambda_div_used": 0.5929878354072571, + "learning_rate": 1.7518544168045524e-07, + "loss": -0.0269, + "reward": -0.31791230058297515, + "reward_after_mean": -0.31791230058297515, + "reward_after_std": 0.5058948453515768, + "reward_before_mean": -0.05459975823760033, + "reward_before_std": 0.4272688911296427, + "reward_change_max": 0.0, + "reward_change_mean": -0.2633125390857458, + "reward_change_min": -0.4145249240100384, + "reward_change_std": 0.15364530310034752, + "reward_std": 0.5058948528021574, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/cosine_scaled_reward": -0.13793309262837283, + "step": 417 + }, + { + "clip_fraction": 0.0, + "completion_length": 2075.0417289733887, + "epoch": 0.4777142857142857, + "grad_norm": 0.03444957733154297, + "kl": 0.00030410289764404297, + "lambda_div_used": 0.5850719586014748, + "learning_rate": 1.7345605894346726e-07, + "loss": -0.0603, + "reward": 0.03764221305027604, + "reward_after_mean": 0.03764221305027604, + "reward_after_std": 0.5699926447123289, + "reward_before_mean": 0.5322978757321835, + "reward_before_std": 0.3904368221992627, + "reward_change_max": 0.0, + "reward_change_mean": -0.4946556333452463, + "reward_change_min": -0.6658468469977379, + "reward_change_std": 0.2617466766387224, + "reward_std": 0.5699926633387804, + "rewards/accuracy_reward": 0.3750000037252903, + "rewards/cosine_scaled_reward": 0.15729784907307476, + "step": 418 + }, + { + "clip_fraction": 0.0, + "completion_length": 2422.062515258789, + "epoch": 0.47885714285714287, + "grad_norm": 0.023973578587174416, + "kl": 0.0002415478229522705, + "lambda_div_used": 0.5876604542136192, + "learning_rate": 1.7174502842694212e-07, + "loss": 0.0464, + "reward": -0.01938623934984207, + "reward_after_mean": -0.01938623934984207, + "reward_after_std": 0.531193170696497, + "reward_before_mean": 0.4354735445231199, + "reward_before_std": 0.4033324606716633, + "reward_change_max": 0.0, + "reward_change_mean": -0.45485977828502655, + "reward_change_min": -0.6743562705814838, + "reward_change_std": 0.2555869175121188, + "reward_std": 0.5311931855976582, + "rewards/accuracy_reward": 0.33333333395421505, + "rewards/cosine_scaled_reward": 0.10214020684361458, + "step": 419 + }, + { + "clip_fraction": 0.0, + "completion_length": 1696.083381652832, + "epoch": 0.48, + "grad_norm": 0.04010794684290886, + "kl": 0.0002671480178833008, + "lambda_div_used": 0.5893955454230309, + "learning_rate": 1.7005243352409333e-07, + "loss": -0.0499, + "reward": -0.18572357669472694, + "reward_after_mean": -0.18572357669472694, + "reward_after_std": 0.4607646930962801, + "reward_before_mean": 0.12988583371043205, + "reward_before_std": 0.416760787833482, + "reward_change_max": 0.0, + "reward_change_mean": -0.31560939736664295, + "reward_change_min": -0.5215952098369598, + "reward_change_std": 0.1989196827635169, + "reward_std": 0.4607647117227316, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/cosine_scaled_reward": -0.14094750862568617, + "step": 420 + }, + { + "clip_fraction": 0.0, + "completion_length": 2925.3333892822266, + "epoch": 0.48114285714285715, + "grad_norm": 0.025533905252814293, + "kl": 0.0003809928894042969, + "lambda_div_used": 0.6258808895945549, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.0252, + "reward": -0.1876915767788887, + "reward_after_mean": -0.1876915767788887, + "reward_after_std": 0.6121686920523643, + "reward_before_mean": 0.07189313881099224, + "reward_before_std": 0.5847431821748614, + "reward_change_max": 0.0, + "reward_change_mean": -0.25958471931517124, + "reward_change_min": -0.5139855779707432, + "reward_change_std": 0.1843523010611534, + "reward_std": 0.6121687144041061, + "rewards/accuracy_reward": 0.16666666977107525, + "rewards/cosine_scaled_reward": -0.09477353328838944, + "step": 421 + }, + { + "clip_fraction": 0.0, + "completion_length": 2811.4166946411133, + "epoch": 0.48228571428571426, + "grad_norm": 0.020569510757923126, + "kl": 0.00035816431045532227, + "lambda_div_used": 0.5612113624811172, + "learning_rate": 1.6672287963562852e-07, + "loss": 0.0257, + "reward": -0.22088398411870003, + "reward_after_mean": -0.22088398411870003, + "reward_after_std": 0.37868294678628445, + "reward_before_mean": 0.17238148115575314, + "reward_before_std": 0.27797973807901144, + "reward_change_max": 0.0, + "reward_change_mean": -0.3932654559612274, + "reward_change_min": -0.5825164802372456, + "reward_change_std": 0.22011223249137402, + "reward_std": 0.3786829560995102, + "rewards/accuracy_reward": 0.1875, + "rewards/cosine_scaled_reward": -0.015118520706892014, + "step": 422 + }, + { + "clip_fraction": 0.0, + "completion_length": 2948.2916870117188, + "epoch": 0.48342857142857143, + "grad_norm": 0.021468866616487503, + "kl": 0.0003040432929992676, + "lambda_div_used": 0.6255258545279503, + "learning_rate": 1.6508608292777203e-07, + "loss": -0.0097, + "reward": -0.10086626000702381, + "reward_after_mean": -0.10086626000702381, + "reward_after_std": 0.5887319762259722, + "reward_before_mean": 0.19233786687254906, + "reward_before_std": 0.5806169025599957, + "reward_change_max": 0.0, + "reward_change_mean": -0.2932041045278311, + "reward_change_min": -0.5194867514073849, + "reward_change_std": 0.19756229128688574, + "reward_std": 0.5887319948524237, + "rewards/accuracy_reward": 0.22916667349636555, + "rewards/cosine_scaled_reward": -0.03682881221175194, + "step": 423 + }, + { + "clip_fraction": 0.0, + "completion_length": 2725.6041946411133, + "epoch": 0.4845714285714286, + "grad_norm": 0.025675497949123383, + "kl": 0.00032907724380493164, + "lambda_div_used": 0.6430495753884315, + "learning_rate": 1.6346804638120098e-07, + "loss": -0.0044, + "reward": -0.06141174025833607, + "reward_after_mean": -0.06141174025833607, + "reward_after_std": 0.6873566564172506, + "reward_before_mean": 0.2285282697994262, + "reward_before_std": 0.664992194622755, + "reward_change_max": 0.0, + "reward_change_mean": -0.28994001634418964, + "reward_change_min": -0.5423923581838608, + "reward_change_std": 0.20123817585408688, + "reward_std": 0.6873566769063473, + "rewards/accuracy_reward": 0.25000000931322575, + "rewards/cosine_scaled_reward": -0.021471746265888214, + "step": 424 + }, + { + "clip_fraction": 0.0, + "completion_length": 1639.2292175292969, + "epoch": 0.4857142857142857, + "grad_norm": 0.025542836636304855, + "kl": 0.00018703937530517578, + "lambda_div_used": 0.6579956188797951, + "learning_rate": 1.6186884885673413e-07, + "loss": 0.0441, + "reward": 0.5941350422799587, + "reward_after_mean": 0.5941350422799587, + "reward_after_std": 0.8043180033564568, + "reward_before_mean": 1.2064684219658375, + "reward_before_std": 0.7356861205771565, + "reward_change_max": 0.0, + "reward_change_mean": -0.6123334169387817, + "reward_change_min": -0.9342719316482544, + "reward_change_std": 0.38359352573752403, + "reward_std": 0.804318018257618, + "rewards/accuracy_reward": 0.7500000186264515, + "rewards/cosine_scaled_reward": 0.456468403339386, + "step": 425 + }, + { + "clip_fraction": 0.0, + "completion_length": 2131.9375381469727, + "epoch": 0.4868571428571429, + "grad_norm": 0.029882676899433136, + "kl": 0.00031810998916625977, + "lambda_div_used": 0.6198792308568954, + "learning_rate": 1.6028856829700258e-07, + "loss": -0.0153, + "reward": 0.03275429271161556, + "reward_after_mean": 0.03275429271161556, + "reward_after_std": 0.6299177911132574, + "reward_before_mean": 0.4296752456575632, + "reward_before_std": 0.5544933034107089, + "reward_change_max": 0.0, + "reward_change_mean": -0.3969209287315607, + "reward_change_min": -0.6235288828611374, + "reward_change_std": 0.24328004382550716, + "reward_std": 0.6299178209155798, + "rewards/accuracy_reward": 0.37500000931322575, + "rewards/cosine_scaled_reward": 0.05467522703111172, + "step": 426 + }, + { + "clip_fraction": 0.0, + "completion_length": 3065.3958587646484, + "epoch": 0.488, + "grad_norm": 0.024560794234275818, + "kl": 0.0003638267517089844, + "lambda_div_used": 0.5728301778435707, + "learning_rate": 1.5872728172265146e-07, + "loss": 0.0713, + "reward": -0.181079788133502, + "reward_after_mean": -0.181079788133502, + "reward_after_std": 0.4291039705276489, + "reward_before_mean": 0.19762573204934597, + "reward_before_std": 0.3286724528297782, + "reward_change_max": 0.0, + "reward_change_mean": -0.3787055220454931, + "reward_change_min": -0.5433184914290905, + "reward_change_std": 0.20709905866533518, + "reward_std": 0.4291039779782295, + "rewards/accuracy_reward": 0.25000000558793545, + "rewards/cosine_scaled_reward": -0.05237427353858948, + "step": 427 + }, + { + "clip_fraction": 0.0, + "completion_length": 2331.416702270508, + "epoch": 0.48914285714285716, + "grad_norm": 0.027837947010993958, + "kl": 0.00027292966842651367, + "lambda_div_used": 0.6576317623257637, + "learning_rate": 1.5718506522858572e-07, + "loss": 0.0627, + "reward": 0.037938148714601994, + "reward_after_mean": 0.037938148714601994, + "reward_after_std": 0.7411033473908901, + "reward_before_mean": 0.3408977910876274, + "reward_before_std": 0.7301055882126093, + "reward_change_max": 0.0, + "reward_change_mean": -0.3029596321284771, + "reward_change_min": -0.5550829358398914, + "reward_change_std": 0.21431603003293276, + "reward_std": 0.7411033622920513, + "rewards/accuracy_reward": 0.3125000074505806, + "rewards/cosine_scaled_reward": 0.028397773392498493, + "step": 428 + }, + { + "clip_fraction": 0.0, + "completion_length": 2035.8750381469727, + "epoch": 0.49028571428571427, + "grad_norm": 0.0320022888481617, + "kl": 0.00036203861236572266, + "lambda_div_used": 0.6039041504263878, + "learning_rate": 1.5566199398026147e-07, + "loss": -0.0493, + "reward": -0.09264844097197056, + "reward_after_mean": -0.09264844097197056, + "reward_after_std": 0.5593565441668034, + "reward_before_mean": 0.28112196549773216, + "reward_before_std": 0.4800149817019701, + "reward_change_max": 0.0, + "reward_change_mean": -0.3737703934311867, + "reward_change_min": -0.602749090641737, + "reward_change_std": 0.22795243095606565, + "reward_std": 0.5593565553426743, + "rewards/accuracy_reward": 0.27083333767950535, + "rewards/cosine_scaled_reward": 0.010288612451404333, + "step": 429 + }, + { + "clip_fraction": 0.0, + "completion_length": 2334.7083778381348, + "epoch": 0.49142857142857144, + "grad_norm": 0.02567731775343418, + "kl": 0.00031498074531555176, + "lambda_div_used": 0.609040379524231, + "learning_rate": 1.5415814221002265e-07, + "loss": 0.0017, + "reward": -0.11068469006568193, + "reward_after_mean": -0.11068469006568193, + "reward_after_std": 0.5843690279871225, + "reward_before_mean": 0.24647100269794464, + "reward_before_std": 0.5133073255419731, + "reward_change_max": 0.0, + "reward_change_mean": -0.35715569369494915, + "reward_change_min": -0.6265733800828457, + "reward_change_std": 0.23320611286908388, + "reward_std": 0.5843690391629934, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/cosine_scaled_reward": -0.02436233009211719, + "step": 430 + }, + { + "clip_fraction": 0.0, + "completion_length": 2203.125015258789, + "epoch": 0.49257142857142855, + "grad_norm": 0.030070627108216286, + "kl": 0.0003618001937866211, + "lambda_div_used": 0.5612699165940285, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0147, + "reward": -0.21507295966148376, + "reward_after_mean": -0.21507295966148376, + "reward_after_std": 0.37470250017941, + "reward_before_mean": 0.18050049245357513, + "reward_before_std": 0.2745527196675539, + "reward_change_max": 0.0, + "reward_change_mean": -0.39557345397770405, + "reward_change_min": -0.5842532999813557, + "reward_change_std": 0.21895906049758196, + "reward_std": 0.37470250204205513, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/cosine_scaled_reward": -0.027832843363285065, + "step": 431 + }, + { + "clip_fraction": 0.0, + "completion_length": 2875.2917098999023, + "epoch": 0.4937142857142857, + "grad_norm": 0.021773481741547585, + "kl": 0.00034308433532714844, + "lambda_div_used": 0.6463272646069527, + "learning_rate": 1.5120838934595337e-07, + "loss": -0.0175, + "reward": 0.0671270489692688, + "reward_after_mean": 0.0671270489692688, + "reward_after_std": 0.6703518275171518, + "reward_before_mean": 0.3971955068409443, + "reward_before_std": 0.6849911892786622, + "reward_change_max": 0.0, + "reward_change_mean": -0.3300684615969658, + "reward_change_min": -0.6204027272760868, + "reward_change_std": 0.24090207554399967, + "reward_std": 0.6703518535941839, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/cosine_scaled_reward": 0.06386216171085835, + "step": 432 + }, + { + "clip_fraction": 0.0, + "completion_length": 2982.0000610351562, + "epoch": 0.4948571428571429, + "grad_norm": 0.021818527951836586, + "kl": 0.0003089308738708496, + "lambda_div_used": 0.5629478171467781, + "learning_rate": 1.4976263201891613e-07, + "loss": -0.0006, + "reward": -0.06292321160435677, + "reward_after_mean": -0.06292321160435677, + "reward_after_std": 0.43814039044082165, + "reward_before_mean": 0.4110525958240032, + "reward_before_std": 0.28899803664535284, + "reward_change_max": 0.0, + "reward_change_mean": -0.4739758223295212, + "reward_change_min": -0.672174334526062, + "reward_change_std": 0.25748884305357933, + "reward_std": 0.4381403960287571, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/cosine_scaled_reward": 0.07771925255656242, + "step": 433 + }, + { + "clip_fraction": 0.0, + "completion_length": 2822.6458587646484, + "epoch": 0.496, + "grad_norm": 0.024085119366645813, + "kl": 0.0003399848937988281, + "lambda_div_used": 0.5637771561741829, + "learning_rate": 1.483363816965435e-07, + "loss": 0.0353, + "reward": -0.39865921065211296, + "reward_after_mean": -0.39865921065211296, + "reward_after_std": 0.34102493710815907, + "reward_before_mean": -0.12386159785091877, + "reward_before_std": 0.2913210419937968, + "reward_change_max": 0.0, + "reward_change_mean": -0.2747976202517748, + "reward_change_min": -0.4381771683692932, + "reward_change_std": 0.16477954387664795, + "reward_std": 0.34102495945990086, + "rewards/accuracy_reward": 0.0625, + "rewards/cosine_scaled_reward": -0.18636160157620907, + "step": 434 + }, + { + "clip_fraction": 0.0, + "completion_length": 2098.333366394043, + "epoch": 0.49714285714285716, + "grad_norm": 0.03476332873106003, + "kl": 0.0004093945026397705, + "lambda_div_used": 0.561927042901516, + "learning_rate": 1.469297078922642e-07, + "loss": 0.012, + "reward": -0.23742017894983292, + "reward_after_mean": -0.23742017894983292, + "reward_after_std": 0.3864587936550379, + "reward_before_mean": 0.1438802983611822, + "reward_before_std": 0.28451414965093136, + "reward_change_max": 0.0, + "reward_change_mean": -0.3813004810363054, + "reward_change_min": -0.5687282234430313, + "reward_change_std": 0.21544194873422384, + "reward_std": 0.3864588178694248, + "rewards/accuracy_reward": 0.1875, + "rewards/cosine_scaled_reward": -0.043619705364108086, + "step": 435 + }, + { + "clip_fraction": 0.0, + "completion_length": 1956.6666984558105, + "epoch": 0.4982857142857143, + "grad_norm": 0.03724474087357521, + "kl": 0.00024643540382385254, + "lambda_div_used": 0.5972427576780319, + "learning_rate": 1.4554267916537495e-07, + "loss": -0.0215, + "reward": 0.24212833493947983, + "reward_after_mean": 0.24212833493947983, + "reward_after_std": 0.5586434360593557, + "reward_before_mean": 0.7999392561614513, + "reward_before_std": 0.44878256041556597, + "reward_change_max": 0.0, + "reward_change_mean": -0.5578109100461006, + "reward_change_min": -0.789992418140173, + "reward_change_std": 0.32050481624901295, + "reward_std": 0.5586434435099363, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/cosine_scaled_reward": 0.27910589799284935, + "step": 436 + }, + { + "clip_fraction": 0.0, + "completion_length": 2701.1458587646484, + "epoch": 0.49942857142857144, + "grad_norm": 0.02587887831032276, + "kl": 0.0003287792205810547, + "lambda_div_used": 0.5530718564987183, + "learning_rate": 1.4417536311769885e-07, + "loss": -0.0406, + "reward": -0.2597166027408093, + "reward_after_mean": -0.2597166027408093, + "reward_after_std": 0.3689497411251068, + "reward_before_mean": 0.13261681143194437, + "reward_before_std": 0.23820086661726236, + "reward_change_max": 0.0, + "reward_change_mean": -0.39233342185616493, + "reward_change_min": -0.5372026227414608, + "reward_change_std": 0.20350486412644386, + "reward_std": 0.3689497448503971, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/cosine_scaled_reward": -0.013216521823778749, + "step": 437 + }, + { + "clip_fraction": 0.0, + "completion_length": 2737.6667404174805, + "epoch": 0.5005714285714286, + "grad_norm": 0.02056184597313404, + "kl": 0.0002714395523071289, + "lambda_div_used": 0.6291297823190689, + "learning_rate": 1.4282782639029128e-07, + "loss": -0.0391, + "reward": 0.011486291885375977, + "reward_after_mean": 0.011486291885375977, + "reward_after_std": 0.6032967660576105, + "reward_before_mean": 0.34824367985129356, + "reward_before_std": 0.6035201866179705, + "reward_change_max": 0.0, + "reward_change_mean": -0.3367573842406273, + "reward_change_min": -0.588931929320097, + "reward_change_std": 0.23199212551116943, + "reward_std": 0.6032967790961266, + "rewards/accuracy_reward": 0.2916666753590107, + "rewards/cosine_scaled_reward": 0.05657700449228287, + "step": 438 + }, + { + "clip_fraction": 0.0, + "completion_length": 2417.875057220459, + "epoch": 0.5017142857142857, + "grad_norm": 0.02567973919212818, + "kl": 0.00029343366622924805, + "lambda_div_used": 0.6277910619974136, + "learning_rate": 1.4150013466019114e-07, + "loss": 0.013, + "reward": -0.039602138102054596, + "reward_after_mean": -0.039602138102054596, + "reward_after_std": 0.6028888281434774, + "reward_before_mean": 0.2732508610934019, + "reward_before_std": 0.5980509808287024, + "reward_change_max": 0.0, + "reward_change_mean": -0.3128530103713274, + "reward_change_min": -0.5787924043834209, + "reward_change_std": 0.21938505861908197, + "reward_std": 0.6028888486325741, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/cosine_scaled_reward": 0.023250849917531013, + "step": 439 + }, + { + "clip_fraction": 0.0, + "completion_length": 2942.770835876465, + "epoch": 0.5028571428571429, + "grad_norm": 0.028702648356556892, + "kl": 0.00038042664527893066, + "lambda_div_used": 0.5716730058193207, + "learning_rate": 1.4019235263722034e-07, + "loss": -0.0684, + "reward": -0.40989339258521795, + "reward_after_mean": -0.40989339258521795, + "reward_after_std": 0.42086669616401196, + "reward_before_mean": -0.15851380862295628, + "reward_before_std": 0.3258522395044565, + "reward_change_max": 0.0, + "reward_change_mean": -0.2513795755803585, + "reward_change_min": -0.3500328026711941, + "reward_change_std": 0.13096946012228727, + "reward_std": 0.42086671106517315, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/cosine_scaled_reward": -0.2001804756000638, + "step": 440 + }, + { + "clip_fraction": 0.0, + "completion_length": 2898.250015258789, + "epoch": 0.504, + "grad_norm": 0.025851793587207794, + "kl": 0.0004258155822753906, + "lambda_div_used": 0.5947419032454491, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.0188, + "reward": -0.09268893860280514, + "reward_after_mean": -0.09268893860280514, + "reward_after_std": 0.527678394690156, + "reward_before_mean": 0.28031823271885514, + "reward_before_std": 0.43342068372294307, + "reward_change_max": 0.0, + "reward_change_mean": -0.373007170855999, + "reward_change_min": -0.5688190311193466, + "reward_change_std": 0.2149599390104413, + "reward_std": 0.5276784114539623, + "rewards/accuracy_reward": 0.31250000558793545, + "rewards/cosine_scaled_reward": -0.03218177333474159, + "step": 441 + }, + { + "clip_fraction": 0.0, + "completion_length": 2738.312530517578, + "epoch": 0.5051428571428571, + "grad_norm": 0.024453362450003624, + "kl": 0.0003438591957092285, + "lambda_div_used": 0.6373191103339195, + "learning_rate": 1.3763677169699217e-07, + "loss": -0.0098, + "reward": -0.11879788711667061, + "reward_after_mean": -0.11879788711667061, + "reward_after_std": 0.6699451506137848, + "reward_before_mean": 0.15645072294864804, + "reward_before_std": 0.6331657916307449, + "reward_change_max": 0.0, + "reward_change_mean": -0.27524859458208084, + "reward_change_min": -0.4614573121070862, + "reward_change_std": 0.1758969947695732, + "reward_std": 0.669945165514946, + "rewards/accuracy_reward": 0.20833334140479565, + "rewards/cosine_scaled_reward": -0.051882621832191944, + "step": 442 + }, + { + "clip_fraction": 0.0, + "completion_length": 3111.3541870117188, + "epoch": 0.5062857142857143, + "grad_norm": 0.018424130976200104, + "kl": 0.0003415346145629883, + "lambda_div_used": 0.594095878303051, + "learning_rate": 1.3638909733514452e-07, + "loss": -0.0021, + "reward": -0.13722801115363836, + "reward_after_mean": -0.13722801115363836, + "reward_after_std": 0.5484136454761028, + "reward_before_mean": 0.23274649307131767, + "reward_before_std": 0.4324050806462765, + "reward_change_max": 0.0, + "reward_change_mean": -0.36997453309595585, + "reward_change_min": -0.530858725309372, + "reward_change_std": 0.2040829285979271, + "reward_std": 0.5484136454761028, + "rewards/accuracy_reward": 0.2500000037252903, + "rewards/cosine_scaled_reward": -0.01725347526371479, + "step": 443 + }, + { + "clip_fraction": 0.0, + "completion_length": 2850.6666984558105, + "epoch": 0.5074285714285715, + "grad_norm": 0.023973438888788223, + "kl": 0.0003078579902648926, + "lambda_div_used": 0.5948461815714836, + "learning_rate": 1.351615817851748e-07, + "loss": -0.0055, + "reward": -0.1747817099094391, + "reward_after_mean": -0.1747817099094391, + "reward_after_std": 0.4692641645669937, + "reward_before_mean": 0.1552269384264946, + "reward_before_std": 0.4344164803624153, + "reward_change_max": 0.0, + "reward_change_mean": -0.3300086557865143, + "reward_change_min": -0.4955419562757015, + "reward_change_std": 0.19987357687205076, + "reward_std": 0.4692641757428646, + "rewards/accuracy_reward": 0.20833334140479565, + "rewards/cosine_scaled_reward": -0.05310639180243015, + "step": 444 + }, + { + "clip_fraction": 0.0, + "completion_length": 2896.1458892822266, + "epoch": 0.5085714285714286, + "grad_norm": 0.021344272419810295, + "kl": 0.000371396541595459, + "lambda_div_used": 0.5911799594759941, + "learning_rate": 1.3395428487445914e-07, + "loss": 0.0204, + "reward": -0.09663986414670944, + "reward_after_mean": -0.09663986414670944, + "reward_after_std": 0.5167377535253763, + "reward_before_mean": 0.29358627926558256, + "reward_before_std": 0.4120226204395294, + "reward_change_max": 0.0, + "reward_change_mean": -0.3902261406183243, + "reward_change_min": -0.5846884902566671, + "reward_change_std": 0.21900581941008568, + "reward_std": 0.5167377851903439, + "rewards/accuracy_reward": 0.31250000558793545, + "rewards/cosine_scaled_reward": -0.018913742154836655, + "step": 445 + }, + { + "clip_fraction": 0.0, + "completion_length": 2849.7083892822266, + "epoch": 0.5097142857142857, + "grad_norm": 0.020637815818190575, + "kl": 0.0003114938735961914, + "lambda_div_used": 0.5921742841601372, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0358, + "reward": -0.19162439927458763, + "reward_after_mean": -0.19162439927458763, + "reward_after_std": 0.4613885171711445, + "reward_before_mean": 0.12132475152611732, + "reward_before_std": 0.42135200183838606, + "reward_change_max": 0.0, + "reward_change_mean": -0.31294916570186615, + "reward_change_min": -0.5426378659904003, + "reward_change_std": 0.196873115375638, + "reward_std": 0.4613885283470154, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/cosine_scaled_reward": -0.06617523916065693, + "step": 446 + }, + { + "clip_fraction": 0.0, + "completion_length": 1833.833351135254, + "epoch": 0.5108571428571429, + "grad_norm": 0.03296150267124176, + "kl": 0.00031435489654541016, + "lambda_div_used": 0.5761597007513046, + "learning_rate": 1.316005813502869e-07, + "loss": -0.0092, + "reward": -0.1308344192802906, + "reward_after_mean": -0.1308344192802906, + "reward_after_std": 0.44760639779269695, + "reward_before_mean": 0.27170680463314056, + "reward_before_std": 0.3468586690723896, + "reward_change_max": 0.0, + "reward_change_mean": -0.40254124999046326, + "reward_change_min": -0.6140144616365433, + "reward_change_std": 0.23130866140127182, + "reward_std": 0.44760641269385815, + "rewards/accuracy_reward": 0.2708333395421505, + "rewards/cosine_scaled_reward": 0.0008734846487641335, + "step": 447 + }, + { + "clip_fraction": 0.0, + "completion_length": 2031.1875228881836, + "epoch": 0.512, + "grad_norm": 0.03561374545097351, + "kl": 0.0002592802047729492, + "lambda_div_used": 0.5793976187705994, + "learning_rate": 1.3045428945301953e-07, + "loss": 0.0625, + "reward": -0.15141154546290636, + "reward_after_mean": -0.15141154546290636, + "reward_after_std": 0.47303674556314945, + "reward_before_mean": 0.2355510238558054, + "reward_before_std": 0.36865185387432575, + "reward_change_max": 0.0, + "reward_change_mean": -0.38696256279945374, + "reward_change_min": -0.6033525615930557, + "reward_change_std": 0.22703420650213957, + "reward_std": 0.47303677164018154, + "rewards/accuracy_reward": 0.2291666679084301, + "rewards/cosine_scaled_reward": 0.00638435548171401, + "step": 448 + }, + { + "clip_fraction": 0.0, + "completion_length": 2527.416679382324, + "epoch": 0.5131428571428571, + "grad_norm": 0.028000032529234886, + "kl": 0.0003337860107421875, + "lambda_div_used": 0.5869953334331512, + "learning_rate": 1.2932844562179352e-07, + "loss": -0.0384, + "reward": -0.2571147223934531, + "reward_after_mean": -0.2571147223934531, + "reward_after_std": 0.44822895526885986, + "reward_before_mean": 0.04588266555219889, + "reward_before_std": 0.39473184011876583, + "reward_change_max": 0.0, + "reward_change_mean": -0.30299739353358746, + "reward_change_min": -0.4681765213608742, + "reward_change_std": 0.17783036269247532, + "reward_std": 0.4482289757579565, + "rewards/accuracy_reward": 0.20833334140479565, + "rewards/cosine_scaled_reward": -0.1624506814405322, + "step": 449 + }, + { + "clip_fraction": 0.0, + "completion_length": 2360.3125228881836, + "epoch": 0.5142857142857142, + "grad_norm": 0.028984738513827324, + "kl": 0.00030869245529174805, + "lambda_div_used": 0.5782932788133621, + "learning_rate": 1.2822310472864885e-07, + "loss": -0.0103, + "reward": -0.1581341177225113, + "reward_after_mean": -0.1581341177225113, + "reward_after_std": 0.45438094437122345, + "reward_before_mean": 0.22925141779705882, + "reward_before_std": 0.3587577445432544, + "reward_change_max": 0.0, + "reward_change_mean": -0.3873855248093605, + "reward_change_min": -0.5598408095538616, + "reward_change_std": 0.21874273754656315, + "reward_std": 0.4543809536844492, + "rewards/accuracy_reward": 0.22916666977107525, + "rewards/cosine_scaled_reward": 8.474662899971008e-05, + "step": 450 + }, + { + "clip_fraction": 0.0, + "completion_length": 2468.1667098999023, + "epoch": 0.5154285714285715, + "grad_norm": 0.035742077976465225, + "kl": 0.0004057884216308594, + "lambda_div_used": 0.5319794341921806, + "learning_rate": 1.2713832064634125e-07, + "loss": -0.0206, + "reward": -0.24930068850517273, + "reward_after_mean": -0.24930068850517273, + "reward_after_std": 0.3233966138213873, + "reward_before_mean": 0.2043198449537158, + "reward_before_std": 0.1404099608771503, + "reward_change_max": 0.0, + "reward_change_mean": -0.4536205381155014, + "reward_change_min": -0.6192456483840942, + "reward_change_std": 0.23171372152864933, + "reward_std": 0.3233966249972582, + "rewards/accuracy_reward": 0.25, + "rewards/cosine_scaled_reward": -0.04568016994744539, + "step": 451 + }, + { + "clip_fraction": 0.0, + "completion_length": 2740.4166831970215, + "epoch": 0.5165714285714286, + "grad_norm": 0.02392762340605259, + "kl": 0.0003084242343902588, + "lambda_div_used": 0.6302760690450668, + "learning_rate": 1.260741462457165e-07, + "loss": -0.0038, + "reward": 0.021876126527786255, + "reward_after_mean": 0.021876126527786255, + "reward_after_std": 0.5877971854060888, + "reward_before_mean": 0.37123518623411655, + "reward_before_std": 0.6054041795432568, + "reward_change_max": 0.0, + "reward_change_mean": -0.34935908019542694, + "reward_change_min": -0.593828123062849, + "reward_change_std": 0.2436074260622263, + "reward_std": 0.5877972133457661, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/cosine_scaled_reward": 0.03790186531841755, + "step": 452 + }, + { + "clip_fraction": 0.0, + "completion_length": 2600.1667518615723, + "epoch": 0.5177142857142857, + "grad_norm": 0.03355313092470169, + "kl": 0.0003637075424194336, + "lambda_div_used": 0.6487660184502602, + "learning_rate": 1.2503063339313356e-07, + "loss": 0.0766, + "reward": 0.08280018530786037, + "reward_after_mean": 0.08280018530786037, + "reward_after_std": 0.721244465559721, + "reward_before_mean": 0.43781263194978237, + "reward_before_std": 0.6883194223046303, + "reward_change_max": 0.0, + "reward_change_mean": -0.35501245222985744, + "reward_change_min": -0.6027462910860777, + "reward_change_std": 0.2280629277229309, + "reward_std": 0.7212444879114628, + "rewards/accuracy_reward": 0.3541666753590107, + "rewards/cosine_scaled_reward": 0.08364596217870712, + "step": 453 + }, + { + "clip_fraction": 0.0, + "completion_length": 2229.520866394043, + "epoch": 0.5188571428571429, + "grad_norm": 0.026976440101861954, + "kl": 0.0003204345703125, + "lambda_div_used": 0.586439348757267, + "learning_rate": 1.2400783294793668e-07, + "loss": 0.0213, + "reward": -0.06522449851036072, + "reward_after_mean": -0.06522449851036072, + "reward_after_std": 0.4758566189557314, + "reward_before_mean": 0.34830280393362045, + "reward_before_std": 0.4015544820576906, + "reward_change_max": 0.0, + "reward_change_mean": -0.4135272856801748, + "reward_change_min": -0.6215804703533649, + "reward_change_std": 0.24525572545826435, + "reward_std": 0.47585663571953773, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/cosine_scaled_reward": 0.05663611739873886, + "step": 454 + }, + { + "clip_fraction": 0.0, + "completion_length": 2910.729217529297, + "epoch": 0.52, + "grad_norm": 0.022959912195801735, + "kl": 0.0003757178783416748, + "lambda_div_used": 0.575455017387867, + "learning_rate": 1.2300579475997657e-07, + "loss": 0.0464, + "reward": -0.396820537163876, + "reward_after_mean": -0.396820537163876, + "reward_after_std": 0.42159392312169075, + "reward_before_mean": -0.14198972191661596, + "reward_before_std": 0.3413227070122957, + "reward_change_max": 0.0, + "reward_change_mean": -0.2548308204859495, + "reward_change_min": -0.36302991211414337, + "reward_change_std": 0.13639382366091013, + "reward_std": 0.4215939249843359, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/cosine_scaled_reward": -0.18365638982504606, + "step": 455 + }, + { + "clip_fraction": 0.0, + "completion_length": 2996.0417098999023, + "epoch": 0.5211428571428571, + "grad_norm": 0.022153589874505997, + "kl": 0.0003063082695007324, + "lambda_div_used": 0.6117950826883316, + "learning_rate": 1.220245676671809e-07, + "loss": 0.03, + "reward": -0.22662064619362354, + "reward_after_mean": -0.22662064619362354, + "reward_after_std": 0.5622703209519386, + "reward_before_mean": 0.03414517780765891, + "reward_before_std": 0.5151250278577209, + "reward_change_max": 0.0, + "reward_change_mean": -0.26076582819223404, + "reward_change_min": -0.43476971983909607, + "reward_change_std": 0.165956006385386, + "reward_std": 0.5622703321278095, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/cosine_scaled_reward": -0.13252148625906557, + "step": 456 + }, + { + "clip_fraction": 0.0, + "completion_length": 2978.187530517578, + "epoch": 0.5222857142857142, + "grad_norm": 0.0224411953240633, + "kl": 0.0004011392593383789, + "lambda_div_used": 0.5359829142689705, + "learning_rate": 1.2106419949317388e-07, + "loss": -0.0311, + "reward": -0.2614034563302994, + "reward_after_mean": -0.2614034563302994, + "reward_after_std": 0.33165648579597473, + "reward_before_mean": 0.16703256964683533, + "reward_before_std": 0.157981239259243, + "reward_change_max": 0.0, + "reward_change_mean": -0.42843602411448956, + "reward_change_min": -0.5874424390494823, + "reward_change_std": 0.21858325507491827, + "reward_std": 0.3316564913839102, + "rewards/accuracy_reward": 0.25, + "rewards/cosine_scaled_reward": -0.08296743780374527, + "step": 457 + }, + { + "clip_fraction": 0.0, + "completion_length": 2119.250026702881, + "epoch": 0.5234285714285715, + "grad_norm": 0.03472839295864105, + "kl": 0.00028389692306518555, + "lambda_div_used": 0.6622679010033607, + "learning_rate": 1.2012473704494537e-07, + "loss": 0.0396, + "reward": 0.09622732177376747, + "reward_after_mean": 0.09622732177376747, + "reward_after_std": 0.7549249790608883, + "reward_before_mean": 0.4134064484387636, + "reward_before_std": 0.7546715997159481, + "reward_change_max": 0.0, + "reward_change_mean": -0.3171791285276413, + "reward_change_min": -0.5532067231833935, + "reward_change_std": 0.2206783127039671, + "reward_std": 0.7549249865114689, + "rewards/accuracy_reward": 0.3541666753590107, + "rewards/cosine_scaled_reward": 0.05923975070982124, + "step": 458 + }, + { + "clip_fraction": 0.0, + "completion_length": 1308.0000534057617, + "epoch": 0.5245714285714286, + "grad_norm": 0.034576416015625, + "kl": 0.00019240379333496094, + "lambda_div_used": 0.6211641430854797, + "learning_rate": 1.1920622611056974e-07, + "loss": 0.0136, + "reward": 0.10968395322561264, + "reward_after_mean": 0.10968395322561264, + "reward_after_std": 0.642698410898447, + "reward_before_mean": 0.5380105744116008, + "reward_before_std": 0.5641936575993896, + "reward_change_max": 0.0, + "reward_change_mean": -0.42832658998668194, + "reward_change_min": -0.647167906165123, + "reward_change_std": 0.26214463263750076, + "reward_std": 0.6426984257996082, + "rewards/accuracy_reward": 0.37500000558793545, + "rewards/cosine_scaled_reward": 0.16301053576171398, + "step": 459 + }, + { + "clip_fraction": 0.0, + "completion_length": 2947.2916870117188, + "epoch": 0.5257142857142857, + "grad_norm": 0.022817425429821014, + "kl": 0.00036215782165527344, + "lambda_div_used": 0.6313923373818398, + "learning_rate": 1.1830871145697412e-07, + "loss": 0.0374, + "reward": -0.143511475995183, + "reward_after_mean": -0.143511475995183, + "reward_after_std": 0.6310819126665592, + "reward_before_mean": 0.12019729614257812, + "reward_before_std": 0.6178826270624995, + "reward_change_max": 0.0, + "reward_change_mean": -0.26370877772569656, + "reward_change_min": -0.5500058270990849, + "reward_change_std": 0.197435456328094, + "reward_std": 0.6310819499194622, + "rewards/accuracy_reward": 0.1875000037252903, + "rewards/cosine_scaled_reward": -0.06730269826948643, + "step": 460 + }, + { + "clip_fraction": 0.0, + "completion_length": 3020.604232788086, + "epoch": 0.5268571428571428, + "grad_norm": 0.019895615056157112, + "kl": 0.0003611445426940918, + "lambda_div_used": 0.6468427553772926, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.0226, + "reward": 0.13486449420452118, + "reward_after_mean": 0.13486449420452118, + "reward_after_std": 0.6662128213793039, + "reward_before_mean": 0.5074618738144636, + "reward_before_std": 0.6824358962476254, + "reward_change_max": 0.0, + "reward_change_mean": -0.3725973889231682, + "reward_change_min": -0.659518338739872, + "reward_change_std": 0.26068645529448986, + "reward_std": 0.6662128381431103, + "rewards/accuracy_reward": 0.3750000111758709, + "rewards/cosine_scaled_reward": 0.1324618849903345, + "step": 461 + }, + { + "clip_fraction": 0.0, + "completion_length": 2896.958354949951, + "epoch": 0.528, + "grad_norm": 0.027659112587571144, + "kl": 0.0003952980041503906, + "lambda_div_used": 0.5935230925679207, + "learning_rate": 1.1657684494105386e-07, + "loss": -0.0084, + "reward": -0.32939455355517566, + "reward_after_mean": -0.32939455355517566, + "reward_after_std": 0.4861418064683676, + "reward_before_mean": -0.06984398560598493, + "reward_before_std": 0.4248745897784829, + "reward_change_max": 0.0, + "reward_change_mean": -0.25955056957900524, + "reward_change_min": -0.4244098737835884, + "reward_change_std": 0.1560937762260437, + "reward_std": 0.4861418195068836, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/cosine_scaled_reward": -0.15317732468247414, + "step": 462 + }, + { + "clip_fraction": 0.0, + "completion_length": 2485.375030517578, + "epoch": 0.5291428571428571, + "grad_norm": 0.02032829262316227, + "kl": 0.00022867321968078613, + "lambda_div_used": 0.6013497039675713, + "learning_rate": 1.1574257748745986e-07, + "loss": 0.1063, + "reward": 0.004297456704080105, + "reward_after_mean": 0.004297456704080105, + "reward_after_std": 0.5376875698566437, + "reward_before_mean": 0.42904988676309586, + "reward_before_std": 0.46889279037714005, + "reward_change_max": 0.0, + "reward_change_mean": -0.4247523993253708, + "reward_change_min": -0.6676977872848511, + "reward_change_std": 0.26014791429042816, + "reward_std": 0.5376875959336758, + "rewards/accuracy_reward": 0.3333333395421505, + "rewards/cosine_scaled_reward": 0.09571653697639704, + "step": 463 + }, + { + "clip_fraction": 0.0, + "completion_length": 1664.770881652832, + "epoch": 0.5302857142857142, + "grad_norm": 0.03104904294013977, + "kl": 0.0002219080924987793, + "lambda_div_used": 0.6030265837907791, + "learning_rate": 1.1492947512799328e-07, + "loss": -0.0781, + "reward": 0.027926755137741566, + "reward_after_mean": 0.027926755137741566, + "reward_after_std": 0.5750233307480812, + "reward_before_mean": 0.4541918604518287, + "reward_before_std": 0.4690140914171934, + "reward_change_max": 0.0, + "reward_change_mean": -0.4262651167809963, + "reward_change_min": -0.597686804831028, + "reward_change_std": 0.22887016367167234, + "reward_std": 0.5750233307480812, + "rewards/accuracy_reward": 0.4166666679084301, + "rewards/cosine_scaled_reward": 0.03752519562840462, + "step": 464 + }, + { + "clip_fraction": 0.0, + "completion_length": 2696.979248046875, + "epoch": 0.5314285714285715, + "grad_norm": 0.02595655620098114, + "kl": 0.0003947019577026367, + "lambda_div_used": 0.6469813883304596, + "learning_rate": 1.1413757749211602e-07, + "loss": 0.0015, + "reward": -0.005829358473420143, + "reward_after_mean": -0.005829358473420143, + "reward_after_std": 0.7018453720957041, + "reward_before_mean": 0.2963540703058243, + "reward_before_std": 0.687056201742962, + "reward_change_max": 0.0, + "reward_change_mean": -0.30218344181776047, + "reward_change_min": -0.5677898563444614, + "reward_change_std": 0.2154219476506114, + "reward_std": 0.7018453869968653, + "rewards/accuracy_reward": 0.25000000558793545, + "rewards/cosine_scaled_reward": 0.04635407403111458, + "step": 465 + }, + { + "clip_fraction": 0.0, + "completion_length": 3051.750030517578, + "epoch": 0.5325714285714286, + "grad_norm": 0.022232649847865105, + "kl": 0.00035768747329711914, + "lambda_div_used": 0.6076068878173828, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.0096, + "reward": -0.09472141414880753, + "reward_after_mean": -0.09472141414880753, + "reward_after_std": 0.5900444928556681, + "reward_before_mean": 0.2662593559361994, + "reward_before_std": 0.5045403479598463, + "reward_change_max": 0.0, + "reward_change_mean": -0.36098079197108746, + "reward_change_min": -0.5804308690130711, + "reward_change_std": 0.22072006110101938, + "reward_std": 0.5900445096194744, + "rewards/accuracy_reward": 0.25000000186264515, + "rewards/cosine_scaled_reward": 0.016259355936199427, + "step": 466 + }, + { + "clip_fraction": 0.0, + "completion_length": 2831.6458702087402, + "epoch": 0.5337142857142857, + "grad_norm": 0.027867048978805542, + "kl": 0.0003955364227294922, + "lambda_div_used": 0.5972240790724754, + "learning_rate": 1.1261754973965422e-07, + "loss": 0.0064, + "reward": -0.16292368434369564, + "reward_after_mean": -0.16292368434369564, + "reward_after_std": 0.5499103963375092, + "reward_before_mean": 0.19015199813293293, + "reward_before_std": 0.4444003812968731, + "reward_change_max": 0.0, + "reward_change_mean": -0.3530757036060095, + "reward_change_min": -0.5379382502287626, + "reward_change_std": 0.1972823329269886, + "reward_std": 0.5499104224145412, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/cosine_scaled_reward": -0.018181337043642998, + "step": 467 + }, + { + "clip_fraction": 0.0, + "completion_length": 2942.583354949951, + "epoch": 0.5348571428571428, + "grad_norm": 0.021755212917923927, + "kl": 0.0003478527069091797, + "lambda_div_used": 0.5782695487141609, + "learning_rate": 1.1188949370707787e-07, + "loss": -0.0004, + "reward": -0.2234923504292965, + "reward_after_mean": -0.2234923504292965, + "reward_after_std": 0.4573483895510435, + "reward_before_mean": 0.13846815121360123, + "reward_before_std": 0.36004857218358666, + "reward_change_max": 0.0, + "reward_change_mean": -0.36196050979197025, + "reward_change_min": -0.5418836548924446, + "reward_change_std": 0.2078724503517151, + "reward_std": 0.45734839886426926, + "rewards/accuracy_reward": 0.20833333395421505, + "rewards/cosine_scaled_reward": -0.06986518204212189, + "step": 468 + }, + { + "clip_fraction": 0.0, + "completion_length": 2679.208396911621, + "epoch": 0.536, + "grad_norm": 0.02693931572139263, + "kl": 0.00039780139923095703, + "lambda_div_used": 0.6250492706894875, + "learning_rate": 1.1118279056249653e-07, + "loss": 0.0181, + "reward": 0.01080943364650011, + "reward_after_mean": 0.01080943364650011, + "reward_after_std": 0.6565821636468172, + "reward_before_mean": 0.38899326138198376, + "reward_before_std": 0.5838505062274635, + "reward_change_max": 0.0, + "reward_change_mean": -0.37818380631506443, + "reward_change_min": -0.6336386613547802, + "reward_change_std": 0.23969437181949615, + "reward_std": 0.6565821878612041, + "rewards/accuracy_reward": 0.31250000558793545, + "rewards/cosine_scaled_reward": 0.07649324322119355, + "step": 469 + }, + { + "clip_fraction": 0.0, + "completion_length": 2651.8333740234375, + "epoch": 0.5371428571428571, + "grad_norm": 0.022814009338617325, + "kl": 0.0003286600112915039, + "lambda_div_used": 0.5820565819740295, + "learning_rate": 1.1049747474962444e-07, + "loss": 0.062, + "reward": -0.1548374481499195, + "reward_after_mean": -0.1548374481499195, + "reward_after_std": 0.46373489685356617, + "reward_before_mean": 0.21819785539992154, + "reward_before_std": 0.37470409646630287, + "reward_change_max": 0.0, + "reward_change_mean": -0.3730353116989136, + "reward_change_min": -0.5532267577946186, + "reward_change_std": 0.2110730605199933, + "reward_std": 0.46373490430414677, + "rewards/accuracy_reward": 0.22916666977107525, + "rewards/cosine_scaled_reward": -0.010968813672661781, + "step": 470 + }, + { + "clip_fraction": 0.0, + "completion_length": 2898.7916946411133, + "epoch": 0.5382857142857143, + "grad_norm": 0.01961471140384674, + "kl": 0.00035419315099716187, + "lambda_div_used": 0.580303005874157, + "learning_rate": 1.0983357966978745e-07, + "loss": -0.0592, + "reward": -0.22997316345572472, + "reward_after_mean": -0.22997316345572472, + "reward_after_std": 0.4005854483693838, + "reward_before_mean": 0.09628471545875072, + "reward_before_std": 0.36470284312963486, + "reward_change_max": 0.0, + "reward_change_mean": -0.3262578770518303, + "reward_change_min": -0.5184299051761627, + "reward_change_std": 0.1985421497374773, + "reward_std": 0.40058545023202896, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/cosine_scaled_reward": -0.07038197666406631, + "step": 471 + }, + { + "clip_fraction": 0.0, + "completion_length": 2656.125030517578, + "epoch": 0.5394285714285715, + "grad_norm": 0.020427875220775604, + "kl": 0.00032085180282592773, + "lambda_div_used": 0.6381691917777061, + "learning_rate": 1.0919113768029517e-07, + "loss": -0.0459, + "reward": -0.11792396203964017, + "reward_after_mean": -0.11792396203964017, + "reward_after_std": 0.6783455964177847, + "reward_before_mean": 0.15370581997558475, + "reward_before_std": 0.6358637362718582, + "reward_change_max": 0.0, + "reward_change_mean": -0.2716297823935747, + "reward_change_min": -0.4567250721156597, + "reward_change_std": 0.1737262774258852, + "reward_std": 0.678345600143075, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/cosine_scaled_reward": -0.033794180024415255, + "step": 472 + }, + { + "clip_fraction": 0.0, + "completion_length": 2983.750015258789, + "epoch": 0.5405714285714286, + "grad_norm": 0.02453148551285267, + "kl": 0.0003961324691772461, + "lambda_div_used": 0.6016801968216896, + "learning_rate": 1.0857018009286381e-07, + "loss": 0.0472, + "reward": -0.006093651056289673, + "reward_after_mean": -0.006093651056289673, + "reward_after_std": 0.5333354268223047, + "reward_before_mean": 0.4089024979621172, + "reward_before_std": 0.4693258060142398, + "reward_change_max": 0.0, + "reward_change_mean": -0.4149961844086647, + "reward_change_min": -0.6490769572556019, + "reward_change_std": 0.25427413638681173, + "reward_std": 0.5333354473114014, + "rewards/accuracy_reward": 0.3333333395421505, + "rewards/cosine_scaled_reward": 0.07556918449699879, + "step": 473 + }, + { + "clip_fraction": 0.0, + "completion_length": 2252.104202270508, + "epoch": 0.5417142857142857, + "grad_norm": 0.02836003340780735, + "kl": 0.00038546323776245117, + "lambda_div_used": 0.5747748166322708, + "learning_rate": 1.0797073717209013e-07, + "loss": 0.0473, + "reward": 0.11940666288137436, + "reward_after_mean": 0.11940666288137436, + "reward_after_std": 0.5397788472473621, + "reward_before_mean": 0.7007539421319962, + "reward_before_std": 0.3393053291365504, + "reward_change_max": 0.0, + "reward_change_mean": -0.5813473239541054, + "reward_change_min": -0.7977702841162682, + "reward_change_std": 0.3083681631833315, + "reward_std": 0.5397788546979427, + "rewards/accuracy_reward": 0.47916666977107525, + "rewards/cosine_scaled_reward": 0.22158729657530785, + "step": 474 + }, + { + "clip_fraction": 0.0, + "completion_length": 2133.562568664551, + "epoch": 0.5428571428571428, + "grad_norm": 0.02848704345524311, + "kl": 0.0002709627151489258, + "lambda_div_used": 0.6917188391089439, + "learning_rate": 1.0739283813397639e-07, + "loss": 0.0352, + "reward": 0.2483121044933796, + "reward_after_mean": 0.2483121044933796, + "reward_after_std": 0.851221090182662, + "reward_before_mean": 0.5981418825685978, + "reward_before_std": 0.9012213246896863, + "reward_change_max": 0.0, + "reward_change_mean": -0.34982976876199245, + "reward_change_min": -0.7087801471352577, + "reward_change_std": 0.28033955581486225, + "reward_std": 0.8512210976332426, + "rewards/accuracy_reward": 0.4375000111758709, + "rewards/cosine_scaled_reward": 0.16064186580479145, + "step": 475 + }, + { + "clip_fraction": 0.0, + "completion_length": 2319.3750381469727, + "epoch": 0.544, + "grad_norm": 0.02834029123187065, + "kl": 0.00039076805114746094, + "lambda_div_used": 0.6543590575456619, + "learning_rate": 1.068365111445064e-07, + "loss": 0.0905, + "reward": 0.3722646813839674, + "reward_after_mean": 0.3722646813839674, + "reward_after_std": 0.76073794439435, + "reward_before_mean": 0.8793929517269135, + "reward_before_std": 0.7163430340588093, + "reward_change_max": 0.0, + "reward_change_mean": -0.5071282722055912, + "reward_change_min": -0.8575163669884205, + "reward_change_std": 0.32718705013394356, + "reward_std": 0.7607379760593176, + "rewards/accuracy_reward": 0.5625000074505806, + "rewards/cosine_scaled_reward": 0.31689293240197003, + "step": 476 + }, + { + "clip_fraction": 0.0, + "completion_length": 1388.479190826416, + "epoch": 0.5451428571428572, + "grad_norm": 0.03197532892227173, + "kl": 0.00019982457160949707, + "lambda_div_used": 0.6069512218236923, + "learning_rate": 1.063017833182728e-07, + "loss": 0.0104, + "reward": 0.2797765755094588, + "reward_after_mean": 0.2797765755094588, + "reward_after_std": 0.6170946378260851, + "reward_before_mean": 0.8468085322529078, + "reward_before_std": 0.48898326186463237, + "reward_change_max": 0.0, + "reward_change_mean": -0.5670319274067879, + "reward_change_min": -0.8104716204106808, + "reward_change_std": 0.32000066339969635, + "reward_std": 0.6170946676284075, + "rewards/accuracy_reward": 0.520833345130086, + "rewards/cosine_scaled_reward": 0.325975195504725, + "step": 477 + }, + { + "clip_fraction": 0.0, + "completion_length": 2758.9375228881836, + "epoch": 0.5462857142857143, + "grad_norm": 0.023568512871861458, + "kl": 0.0003166794776916504, + "lambda_div_used": 0.6045826748013496, + "learning_rate": 1.0578868071715544e-07, + "loss": 0.0424, + "reward": 0.06877373531460762, + "reward_after_mean": 0.06877373531460762, + "reward_after_std": 0.5457048490643501, + "reward_before_mean": 0.5106075219810009, + "reward_before_std": 0.4815037827938795, + "reward_change_max": 0.0, + "reward_change_mean": -0.4418338183313608, + "reward_change_min": -0.6827114932239056, + "reward_change_std": 0.2656334117054939, + "reward_std": 0.5457048676908016, + "rewards/accuracy_reward": 0.3750000111758709, + "rewards/cosine_scaled_reward": 0.1356075219810009, + "step": 478 + }, + { + "clip_fraction": 0.0, + "completion_length": 2932.895866394043, + "epoch": 0.5474285714285714, + "grad_norm": 0.025046760216355324, + "kl": 0.00037872791290283203, + "lambda_div_used": 0.5930827036499977, + "learning_rate": 1.0529722834905125e-07, + "loss": 0.0013, + "reward": -0.29585114773362875, + "reward_after_mean": -0.29585114773362875, + "reward_after_std": 0.4981949180364609, + "reward_before_mean": -0.02313473215326667, + "reward_before_std": 0.4286517295986414, + "reward_change_max": 0.0, + "reward_change_mean": -0.27271641232073307, + "reward_change_min": -0.4339797645807266, + "reward_change_std": 0.1596519472077489, + "reward_std": 0.4981949217617512, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/cosine_scaled_reward": -0.12730140378698707, + "step": 479 + }, + { + "clip_fraction": 0.0, + "completion_length": 2238.062515258789, + "epoch": 0.5485714285714286, + "grad_norm": 0.03429793193936348, + "kl": 0.0003292560577392578, + "lambda_div_used": 0.6356522366404533, + "learning_rate": 1.0482745016665526e-07, + "loss": 0.0622, + "reward": -0.19213765393942595, + "reward_after_mean": -0.19213765393942595, + "reward_after_std": 0.6865857243537903, + "reward_before_mean": 0.048641178291291, + "reward_before_std": 0.6263551618903875, + "reward_change_max": 0.0, + "reward_change_mean": -0.24077884666621685, + "reward_change_min": -0.43974681198596954, + "reward_change_std": 0.15421691350638866, + "reward_std": 0.6865857467055321, + "rewards/accuracy_reward": 0.14583333767950535, + "rewards/cosine_scaled_reward": -0.09719215868972242, + "step": 480 + }, + { + "clip_fraction": 0.0, + "completion_length": 2533.666732788086, + "epoch": 0.5497142857142857, + "grad_norm": 0.01749316044151783, + "kl": 0.00031572580337524414, + "lambda_div_used": 0.6021129563450813, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0031, + "reward": -0.24226298835128546, + "reward_after_mean": -0.24226298835128546, + "reward_after_std": 0.5046281572431326, + "reward_before_mean": 0.02957908995449543, + "reward_before_std": 0.47462415788322687, + "reward_change_max": 0.0, + "reward_change_mean": -0.2718420699238777, + "reward_change_min": -0.45962032303214073, + "reward_change_std": 0.1774886343628168, + "reward_std": 0.5046281665563583, + "rewards/accuracy_reward": 0.14583333767950535, + "rewards/cosine_scaled_reward": -0.11625425447709858, + "step": 481 + }, + { + "clip_fraction": 0.0, + "completion_length": 2584.3958702087402, + "epoch": 0.5508571428571428, + "grad_norm": 0.027141164988279343, + "kl": 0.0004488229751586914, + "lambda_div_used": 0.6000719964504242, + "learning_rate": 1.0395300688680625e-07, + "loss": -0.0155, + "reward": 0.2671008687466383, + "reward_after_mean": 0.2671008687466383, + "reward_after_std": 0.6417696066200733, + "reward_before_mean": 0.8570855539292097, + "reward_before_std": 0.46050422452390194, + "reward_change_max": 0.0, + "reward_change_mean": -0.5899846386164427, + "reward_change_min": -0.8298100866377354, + "reward_change_std": 0.3227744400501251, + "reward_std": 0.6417696103453636, + "rewards/accuracy_reward": 0.5625000055879354, + "rewards/cosine_scaled_reward": 0.29458553344011307, + "step": 482 + }, + { + "clip_fraction": 0.0, + "completion_length": 2577.833366394043, + "epoch": 0.552, + "grad_norm": 0.022445959970355034, + "kl": 0.00040972232818603516, + "lambda_div_used": 0.5583987012505531, + "learning_rate": 1.0354838440848501e-07, + "loss": -0.0221, + "reward": -0.2919683400541544, + "reward_after_mean": -0.2919683400541544, + "reward_after_std": 0.37157695554196835, + "reward_before_mean": 0.06092929560691118, + "reward_before_std": 0.26688239723443985, + "reward_change_max": 0.0, + "reward_change_mean": -0.35289763286709785, + "reward_change_min": -0.5401931628584862, + "reward_change_std": 0.19714731443673372, + "reward_std": 0.37157696671783924, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.10573737230151892, + "step": 483 + }, + { + "clip_fraction": 0.0, + "completion_length": 2279.06254196167, + "epoch": 0.5531428571428572, + "grad_norm": 0.027125045657157898, + "kl": 0.0002903938293457031, + "lambda_div_used": 0.6015297621488571, + "learning_rate": 1.0316552135205837e-07, + "loss": -0.0284, + "reward": 0.08303672191686928, + "reward_after_mean": 0.08303672191686928, + "reward_after_std": 0.5998833030462265, + "reward_before_mean": 0.5530420504510403, + "reward_before_std": 0.4744609510526061, + "reward_change_max": 0.0, + "reward_change_mean": -0.47000533528625965, + "reward_change_min": -0.6774297691881657, + "reward_change_std": 0.2718982184305787, + "reward_std": 0.5998833123594522, + "rewards/accuracy_reward": 0.41666666977107525, + "rewards/cosine_scaled_reward": 0.13637536205351353, + "step": 484 + }, + { + "clip_fraction": 0.0, + "completion_length": 1672.458351135254, + "epoch": 0.5542857142857143, + "grad_norm": 0.03484974429011345, + "kl": 0.00024706125259399414, + "lambda_div_used": 0.5606790855526924, + "learning_rate": 1.0280443637773163e-07, + "loss": -0.0009, + "reward": -0.24691498838365078, + "reward_after_mean": -0.24691498838365078, + "reward_after_std": 0.4085298776626587, + "reward_before_mean": 0.13868734147399664, + "reward_before_std": 0.2728640455752611, + "reward_change_max": 0.0, + "reward_change_mean": -0.38560234755277634, + "reward_change_min": -0.5570618100464344, + "reward_change_std": 0.20243172626942396, + "reward_std": 0.408529881387949, + "rewards/accuracy_reward": 0.18750000186264515, + "rewards/cosine_scaled_reward": -0.04881266225129366, + "step": 485 + }, + { + "clip_fraction": 0.0, + "completion_length": 1802.2083473205566, + "epoch": 0.5554285714285714, + "grad_norm": 0.03193674981594086, + "kl": 0.000291287899017334, + "lambda_div_used": 0.6145108714699745, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.0763, + "reward": -0.15824460261501372, + "reward_after_mean": -0.15824460261501372, + "reward_after_std": 0.5667215548455715, + "reward_before_mean": 0.12157449871301651, + "reward_before_std": 0.5284877885133028, + "reward_change_max": 0.0, + "reward_change_mean": -0.2798191010951996, + "reward_change_min": -0.43175532296299934, + "reward_change_std": 0.17297773249447346, + "reward_std": 0.566721560433507, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/cosine_scaled_reward": -0.0867588329128921, + "step": 486 + }, + { + "clip_fraction": 0.0, + "completion_length": 1764.979190826416, + "epoch": 0.5565714285714286, + "grad_norm": 0.03538261726498604, + "kl": 0.0003072023391723633, + "lambda_div_used": 0.6353137269616127, + "learning_rate": 1.0214767000817596e-07, + "loss": 0.0034, + "reward": 0.4021994969807565, + "reward_after_mean": 0.4021994969807565, + "reward_after_std": 0.7458214424550533, + "reward_before_mean": 0.9773663654923439, + "reward_before_std": 0.6311542720068246, + "reward_change_max": 0.0, + "reward_change_mean": -0.5751669164747, + "reward_change_min": -0.8273528963327408, + "reward_change_std": 0.3425491387024522, + "reward_std": 0.745821475982666, + "rewards/accuracy_reward": 0.6041666772216558, + "rewards/cosine_scaled_reward": 0.37319972552359104, + "step": 487 + }, + { + "clip_fraction": 0.0, + "completion_length": 2063.020866394043, + "epoch": 0.5577142857142857, + "grad_norm": 0.027028201147913933, + "kl": 0.0002803802490234375, + "lambda_div_used": 0.5719931498169899, + "learning_rate": 1.0185202062281336e-07, + "loss": 0.0247, + "reward": -0.2594454251229763, + "reward_after_mean": -0.2594454251229763, + "reward_after_std": 0.458538630977273, + "reward_before_mean": 0.09588468819856644, + "reward_before_std": 0.3299330030567944, + "reward_change_max": 0.0, + "reward_change_mean": -0.35533010959625244, + "reward_change_min": -0.5170786269009113, + "reward_change_std": 0.19317798037081957, + "reward_std": 0.45853864029049873, + "rewards/accuracy_reward": 0.18750000186264515, + "rewards/cosine_scaled_reward": -0.09161530435085297, + "step": 488 + }, + { + "clip_fraction": 0.0, + "completion_length": 3084.8958892822266, + "epoch": 0.5588571428571428, + "grad_norm": 0.02722005918622017, + "kl": 0.00041961669921875, + "lambda_div_used": 0.5843943357467651, + "learning_rate": 1.0157821333772304e-07, + "loss": -0.012, + "reward": -0.31660015136003494, + "reward_after_mean": -0.31660015136003494, + "reward_after_std": 0.43002712167799473, + "reward_before_mean": -0.0431265402585268, + "reward_before_std": 0.38921575900167227, + "reward_change_max": 0.0, + "reward_change_mean": -0.2734736017882824, + "reward_change_min": -0.4792550317943096, + "reward_change_std": 0.17569494806230068, + "reward_std": 0.4300271272659302, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/cosine_scaled_reward": -0.14729320164769888, + "step": 489 + }, + { + "clip_fraction": 0.0, + "completion_length": 2155.020851135254, + "epoch": 0.56, + "grad_norm": 0.024445833638310432, + "kl": 0.00034758448600769043, + "lambda_div_used": 0.5682637020945549, + "learning_rate": 1.013262614978859e-07, + "loss": -0.0323, + "reward": -0.13516036188229918, + "reward_after_mean": -0.13516036188229918, + "reward_after_std": 0.41841856203973293, + "reward_before_mean": 0.28103313967585564, + "reward_before_std": 0.307465685531497, + "reward_change_max": 0.0, + "reward_change_mean": -0.41619347035884857, + "reward_change_min": -0.5914728902280331, + "reward_change_std": 0.22987399622797966, + "reward_std": 0.4184185788035393, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/cosine_scaled_reward": -0.01063353568315506, + "step": 490 + }, + { + "clip_fraction": 0.0, + "completion_length": 2476.8958740234375, + "epoch": 0.5611428571428572, + "grad_norm": 0.026141280308365822, + "kl": 0.0003236532211303711, + "lambda_div_used": 0.61311075091362, + "learning_rate": 1.0109617738307911e-07, + "loss": -0.0399, + "reward": 0.14587094401940703, + "reward_after_mean": 0.14587094401940703, + "reward_after_std": 0.6566581912338734, + "reward_before_mean": 0.6461835531517863, + "reward_before_std": 0.5190226640552282, + "reward_change_max": 0.0, + "reward_change_mean": -0.500312577933073, + "reward_change_min": -0.7899926863610744, + "reward_change_std": 0.29301502648741007, + "reward_std": 0.6566582024097443, + "rewards/accuracy_reward": 0.45833334140479565, + "rewards/cosine_scaled_reward": 0.1878501633182168, + "step": 491 + }, + { + "clip_fraction": 0.0, + "completion_length": 2644.770835876465, + "epoch": 0.5622857142857143, + "grad_norm": 0.04309506341814995, + "kl": 0.000333636999130249, + "lambda_div_used": 0.5899444594979286, + "learning_rate": 1.0088797220727779e-07, + "loss": 0.012, + "reward": -0.1085394024848938, + "reward_after_mean": -0.1085394024848938, + "reward_after_std": 0.4626395758241415, + "reward_before_mean": 0.25940042175352573, + "reward_before_std": 0.4149730410426855, + "reward_change_max": 0.0, + "reward_change_mean": -0.3679398000240326, + "reward_change_min": -0.5678628534078598, + "reward_change_std": 0.223101656883955, + "reward_std": 0.46263957768678665, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/cosine_scaled_reward": -0.011432923376560211, + "step": 492 + }, + { + "clip_fraction": 0.0, + "completion_length": 1723.0000457763672, + "epoch": 0.5634285714285714, + "grad_norm": 0.029525646939873695, + "kl": 0.00028970837593078613, + "lambda_div_used": 0.6680872738361359, + "learning_rate": 1.0070165611810855e-07, + "loss": 0.0137, + "reward": 0.13114306051284075, + "reward_after_mean": 0.13114306051284075, + "reward_after_std": 0.8511836100369692, + "reward_before_mean": 0.4935412285849452, + "reward_before_std": 0.7856986094266176, + "reward_change_max": 0.0, + "reward_change_mean": -0.3623981699347496, + "reward_change_min": -0.587810892611742, + "reward_change_std": 0.22921003215014935, + "reward_std": 0.8511836417019367, + "rewards/accuracy_reward": 0.37500000558793545, + "rewards/cosine_scaled_reward": 0.11854122020304203, + "step": 493 + }, + { + "clip_fraction": 0.0, + "completion_length": 1625.3125915527344, + "epoch": 0.5645714285714286, + "grad_norm": 0.031883496791124344, + "kl": 0.00029462575912475586, + "lambda_div_used": 0.6554152071475983, + "learning_rate": 1.005372381963547e-07, + "loss": -0.0192, + "reward": 0.2886330671608448, + "reward_after_mean": 0.2886330671608448, + "reward_after_std": 0.7760931197553873, + "reward_before_mean": 0.7313874992541969, + "reward_before_std": 0.7224944466724992, + "reward_change_max": 0.0, + "reward_change_mean": -0.4427544269710779, + "reward_change_min": -0.7037210427224636, + "reward_change_std": 0.28225341718643904, + "reward_std": 0.776093129068613, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/cosine_scaled_reward": 0.2730541592463851, + "step": 494 + }, + { + "clip_fraction": 0.0, + "completion_length": 2404.104217529297, + "epoch": 0.5657142857142857, + "grad_norm": 0.025816943496465683, + "kl": 0.00029496103525161743, + "lambda_div_used": 0.6370417103171349, + "learning_rate": 1.0039472645551372e-07, + "loss": 0.0444, + "reward": -0.002785082906484604, + "reward_after_mean": -0.002785082906484604, + "reward_after_std": 0.7017532214522362, + "reward_before_mean": 0.35137681500054896, + "reward_before_std": 0.6334654297679663, + "reward_change_max": 0.0, + "reward_change_mean": -0.3541618827730417, + "reward_change_min": -0.61022624745965, + "reward_change_std": 0.2296114508062601, + "reward_std": 0.7017532512545586, + "rewards/accuracy_reward": 0.3333333395421505, + "rewards/cosine_scaled_reward": 0.018043467309325933, + "step": 495 + }, + { + "clip_fraction": 0.0, + "completion_length": 1731.9791717529297, + "epoch": 0.5668571428571428, + "grad_norm": 0.03966463729739189, + "kl": 0.00029587745666503906, + "lambda_div_used": 0.6256460249423981, + "learning_rate": 1.002741278414069e-07, + "loss": 0.0666, + "reward": 0.17603341676294804, + "reward_after_mean": 0.17603341676294804, + "reward_after_std": 0.636838972568512, + "reward_before_mean": 0.6388222957029939, + "reward_before_std": 0.586422567255795, + "reward_change_max": 0.0, + "reward_change_mean": -0.4627888761460781, + "reward_change_min": -0.7264026664197445, + "reward_change_std": 0.29389980621635914, + "reward_std": 0.6368389893323183, + "rewards/accuracy_reward": 0.4166666753590107, + "rewards/cosine_scaled_reward": 0.22215561103075743, + "step": 496 + }, + { + "clip_fraction": 0.0, + "completion_length": 2216.1250534057617, + "epoch": 0.568, + "grad_norm": 0.028931519016623497, + "kl": 0.0002690255641937256, + "lambda_div_used": 0.6521744430065155, + "learning_rate": 1.0017544823184055e-07, + "loss": 0.0104, + "reward": 0.271162249147892, + "reward_after_mean": 0.271162249147892, + "reward_after_std": 0.7392647787928581, + "reward_before_mean": 0.7059557363390923, + "reward_before_std": 0.7143486840650439, + "reward_change_max": 0.0, + "reward_change_mean": -0.43479350954294205, + "reward_change_min": -0.7290924154222012, + "reward_change_std": 0.2943303110077977, + "reward_std": 0.7392647992819548, + "rewards/accuracy_reward": 0.4791666753590107, + "rewards/cosine_scaled_reward": 0.2267890479415655, + "step": 497 + }, + { + "clip_fraction": 0.0, + "completion_length": 2327.6250534057617, + "epoch": 0.5691428571428572, + "grad_norm": 0.02442491240799427, + "kl": 0.0003362894058227539, + "lambda_div_used": 0.659889928996563, + "learning_rate": 1.0009869243631952e-07, + "loss": 0.0113, + "reward": 0.31018914096057415, + "reward_after_mean": 0.31018914096057415, + "reward_after_std": 0.7232479602098465, + "reward_before_mean": 0.7269191518425941, + "reward_before_std": 0.741204846650362, + "reward_change_max": 0.0, + "reward_change_mean": -0.41672998666763306, + "reward_change_min": -0.7166927941143513, + "reward_change_std": 0.2858916409313679, + "reward_std": 0.7232479825615883, + "rewards/accuracy_reward": 0.5000000111758709, + "rewards/cosine_scaled_reward": 0.22691912204027176, + "step": 498 + }, + { + "clip_fraction": 0.0, + "completion_length": 2263.500026702881, + "epoch": 0.5702857142857143, + "grad_norm": 0.024736450985074043, + "kl": 0.0002918243408203125, + "lambda_div_used": 0.60347481071949, + "learning_rate": 1.000438641958131e-07, + "loss": -0.0248, + "reward": 0.009699596092104912, + "reward_after_mean": 0.009699596092104912, + "reward_after_std": 0.5504223238676786, + "reward_before_mean": 0.4187678713351488, + "reward_before_std": 0.483045837841928, + "reward_change_max": 0.0, + "reward_change_mean": -0.4090682379901409, + "reward_change_min": -0.6463241390883923, + "reward_change_std": 0.2534347465261817, + "reward_std": 0.5504223313182592, + "rewards/accuracy_reward": 0.3333333395421505, + "rewards/cosine_scaled_reward": 0.08543450571596622, + "step": 499 + }, + { + "clip_fraction": 0.0, + "completion_length": 2641.000030517578, + "epoch": 0.5714285714285714, + "grad_norm": 0.02830589935183525, + "kl": 0.00043022632598876953, + "lambda_div_used": 0.6449039503931999, + "learning_rate": 1.0001096618257236e-07, + "loss": 0.0892, + "reward": -0.016204694285988808, + "reward_after_mean": -0.016204694285988808, + "reward_after_std": 0.6894690785557032, + "reward_before_mean": 0.2851157810073346, + "reward_before_std": 0.6750934664160013, + "reward_change_max": 0.0, + "reward_change_mean": -0.3013204652816057, + "reward_change_min": -0.5549999382346869, + "reward_change_std": 0.20787928439676762, + "reward_std": 0.6894691102206707, + "rewards/accuracy_reward": 0.29166667722165585, + "rewards/cosine_scaled_reward": -0.006550896912813187, + "step": 500 + }, + { + "epoch": 0.5714285714285714, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.0046342451156378955, + "train_runtime": 100887.1622, + "train_samples_per_second": 0.238, + "train_steps_per_second": 0.005 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..6392141 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b630f1da3b8a9f5dc75e704d29f8dbf0464a2b9b7c42e6843ec187c68bc4ed7 +size 8824