commit d889a5ea5a4d6fb9d48bfbe72b84b234ceeeab33 Author: ModelHub XC Date: Fri May 22 11:25:17 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: kangdawei/DRA-DR_GRPO Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..8f617cb --- /dev/null +++ b/.gitattributes @@ -0,0 +1,37 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +reward_data/all_rewards.csv filter=lfs diff=lfs merge=lfs -text 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..9815135 --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +datasets: knoveleng/open-rs +library_name: transformers +model_name: DRA-DR_GRPO +tags: +- generated_from_trainer +- open-r1 +- trl +- grpo +licence: license +--- + +# Model Card for DRA-DR_GRPO + +This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on the [knoveleng/open-rs](https://huggingface.co/datasets/knoveleng/open-rs) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="kangdawei/DRA-DR_GRPO", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + + + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + +### Framework versions + +- TRL: 0.16.0.dev0 +- Transformers: 4.49.0 +- Pytorch: 2.5.1 +- Datasets: 3.2.0 +- Tokenizers: 0.21.4 + +## Citations + +Cite GRPO as: + +```bibtex +@article{zhihong2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. 
Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} + +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..8841ac2 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 0.007517670260873274, + "train_runtime": 53153.3665, + "train_samples": 7000, + "train_samples_per_second": 0.452, + "train_steps_per_second": 0.009 +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..7052064 --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": 4096, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.49.0", + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..01dfe4b --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + 
"bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.49.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..6941207 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16f73cfc66a99238305bf353cd9f65c420f6cd566317a24207bfd5ffbde50ea5 +size 3554214752 diff --git a/reward_data/all_rewards.csv b/reward_data/all_rewards.csv new file mode 100644 index 0000000..7176cb1 --- /dev/null +++ b/reward_data/all_rewards.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32529ce43df6cc0e756e4077d486d734350df38aab9c72c6a74aac86c9f6548f +size 78888349 diff --git a/reward_plots/advantage_plot_step_0.png b/reward_plots/advantage_plot_step_0.png new file mode 100644 index 0000000..ac6073c Binary files /dev/null and b/reward_plots/advantage_plot_step_0.png differ diff --git a/reward_plots/advantage_plot_step_10.png b/reward_plots/advantage_plot_step_10.png new file mode 100644 index 0000000..9238e07 Binary files /dev/null and b/reward_plots/advantage_plot_step_10.png differ diff --git a/reward_plots/advantage_plot_step_100.png b/reward_plots/advantage_plot_step_100.png new file mode 100644 index 0000000..2f9f97e Binary files /dev/null and b/reward_plots/advantage_plot_step_100.png differ diff --git a/reward_plots/advantage_plot_step_110.png b/reward_plots/advantage_plot_step_110.png new file mode 100644 index 0000000..b7ee09f Binary files /dev/null and b/reward_plots/advantage_plot_step_110.png differ diff --git a/reward_plots/advantage_plot_step_120.png b/reward_plots/advantage_plot_step_120.png new file mode 100644 index 0000000..89313b4 Binary files /dev/null and b/reward_plots/advantage_plot_step_120.png differ diff --git a/reward_plots/advantage_plot_step_130.png b/reward_plots/advantage_plot_step_130.png new file mode 100644 index 0000000..b070661 Binary files /dev/null and 
b/reward_plots/advantage_plot_step_130.png differ diff --git a/reward_plots/advantage_plot_step_140.png b/reward_plots/advantage_plot_step_140.png new file mode 100644 index 0000000..aa5603f Binary files /dev/null and b/reward_plots/advantage_plot_step_140.png differ diff --git a/reward_plots/advantage_plot_step_150.png b/reward_plots/advantage_plot_step_150.png new file mode 100644 index 0000000..3386c0d Binary files /dev/null and b/reward_plots/advantage_plot_step_150.png differ diff --git a/reward_plots/advantage_plot_step_160.png b/reward_plots/advantage_plot_step_160.png new file mode 100644 index 0000000..acdc63f Binary files /dev/null and b/reward_plots/advantage_plot_step_160.png differ diff --git a/reward_plots/advantage_plot_step_170.png b/reward_plots/advantage_plot_step_170.png new file mode 100644 index 0000000..490fb99 Binary files /dev/null and b/reward_plots/advantage_plot_step_170.png differ diff --git a/reward_plots/advantage_plot_step_180.png b/reward_plots/advantage_plot_step_180.png new file mode 100644 index 0000000..cde35e3 Binary files /dev/null and b/reward_plots/advantage_plot_step_180.png differ diff --git a/reward_plots/advantage_plot_step_190.png b/reward_plots/advantage_plot_step_190.png new file mode 100644 index 0000000..239b1d0 Binary files /dev/null and b/reward_plots/advantage_plot_step_190.png differ diff --git a/reward_plots/advantage_plot_step_20.png b/reward_plots/advantage_plot_step_20.png new file mode 100644 index 0000000..03d21d1 Binary files /dev/null and b/reward_plots/advantage_plot_step_20.png differ diff --git a/reward_plots/advantage_plot_step_200.png b/reward_plots/advantage_plot_step_200.png new file mode 100644 index 0000000..1045cfe Binary files /dev/null and b/reward_plots/advantage_plot_step_200.png differ diff --git a/reward_plots/advantage_plot_step_210.png b/reward_plots/advantage_plot_step_210.png new file mode 100644 index 0000000..0991364 Binary files /dev/null and 
b/reward_plots/advantage_plot_step_210.png differ diff --git a/reward_plots/advantage_plot_step_220.png b/reward_plots/advantage_plot_step_220.png new file mode 100644 index 0000000..3cbf5bf Binary files /dev/null and b/reward_plots/advantage_plot_step_220.png differ diff --git a/reward_plots/advantage_plot_step_230.png b/reward_plots/advantage_plot_step_230.png new file mode 100644 index 0000000..80872a5 Binary files /dev/null and b/reward_plots/advantage_plot_step_230.png differ diff --git a/reward_plots/advantage_plot_step_240.png b/reward_plots/advantage_plot_step_240.png new file mode 100644 index 0000000..cee05e6 Binary files /dev/null and b/reward_plots/advantage_plot_step_240.png differ diff --git a/reward_plots/advantage_plot_step_250.png b/reward_plots/advantage_plot_step_250.png new file mode 100644 index 0000000..e9088f2 Binary files /dev/null and b/reward_plots/advantage_plot_step_250.png differ diff --git a/reward_plots/advantage_plot_step_260.png b/reward_plots/advantage_plot_step_260.png new file mode 100644 index 0000000..a1ab8eb Binary files /dev/null and b/reward_plots/advantage_plot_step_260.png differ diff --git a/reward_plots/advantage_plot_step_270.png b/reward_plots/advantage_plot_step_270.png new file mode 100644 index 0000000..2a2940c Binary files /dev/null and b/reward_plots/advantage_plot_step_270.png differ diff --git a/reward_plots/advantage_plot_step_280.png b/reward_plots/advantage_plot_step_280.png new file mode 100644 index 0000000..b85c7bc Binary files /dev/null and b/reward_plots/advantage_plot_step_280.png differ diff --git a/reward_plots/advantage_plot_step_290.png b/reward_plots/advantage_plot_step_290.png new file mode 100644 index 0000000..a7ba313 Binary files /dev/null and b/reward_plots/advantage_plot_step_290.png differ diff --git a/reward_plots/advantage_plot_step_30.png b/reward_plots/advantage_plot_step_30.png new file mode 100644 index 0000000..2c60fa1 Binary files /dev/null and 
b/reward_plots/advantage_plot_step_30.png differ diff --git a/reward_plots/advantage_plot_step_300.png b/reward_plots/advantage_plot_step_300.png new file mode 100644 index 0000000..7c66f12 Binary files /dev/null and b/reward_plots/advantage_plot_step_300.png differ diff --git a/reward_plots/advantage_plot_step_310.png b/reward_plots/advantage_plot_step_310.png new file mode 100644 index 0000000..6c5d0e7 Binary files /dev/null and b/reward_plots/advantage_plot_step_310.png differ diff --git a/reward_plots/advantage_plot_step_320.png b/reward_plots/advantage_plot_step_320.png new file mode 100644 index 0000000..ddb6e29 Binary files /dev/null and b/reward_plots/advantage_plot_step_320.png differ diff --git a/reward_plots/advantage_plot_step_330.png b/reward_plots/advantage_plot_step_330.png new file mode 100644 index 0000000..7ed1029 Binary files /dev/null and b/reward_plots/advantage_plot_step_330.png differ diff --git a/reward_plots/advantage_plot_step_340.png b/reward_plots/advantage_plot_step_340.png new file mode 100644 index 0000000..fcfdee9 Binary files /dev/null and b/reward_plots/advantage_plot_step_340.png differ diff --git a/reward_plots/advantage_plot_step_350.png b/reward_plots/advantage_plot_step_350.png new file mode 100644 index 0000000..eb48745 Binary files /dev/null and b/reward_plots/advantage_plot_step_350.png differ diff --git a/reward_plots/advantage_plot_step_360.png b/reward_plots/advantage_plot_step_360.png new file mode 100644 index 0000000..1ce1372 Binary files /dev/null and b/reward_plots/advantage_plot_step_360.png differ diff --git a/reward_plots/advantage_plot_step_370.png b/reward_plots/advantage_plot_step_370.png new file mode 100644 index 0000000..6ef35a4 Binary files /dev/null and b/reward_plots/advantage_plot_step_370.png differ diff --git a/reward_plots/advantage_plot_step_380.png b/reward_plots/advantage_plot_step_380.png new file mode 100644 index 0000000..19b26d5 Binary files /dev/null and 
b/reward_plots/advantage_plot_step_380.png differ diff --git a/reward_plots/advantage_plot_step_390.png b/reward_plots/advantage_plot_step_390.png new file mode 100644 index 0000000..83ca17f Binary files /dev/null and b/reward_plots/advantage_plot_step_390.png differ diff --git a/reward_plots/advantage_plot_step_40.png b/reward_plots/advantage_plot_step_40.png new file mode 100644 index 0000000..a601896 Binary files /dev/null and b/reward_plots/advantage_plot_step_40.png differ diff --git a/reward_plots/advantage_plot_step_400.png b/reward_plots/advantage_plot_step_400.png new file mode 100644 index 0000000..116ea37 Binary files /dev/null and b/reward_plots/advantage_plot_step_400.png differ diff --git a/reward_plots/advantage_plot_step_410.png b/reward_plots/advantage_plot_step_410.png new file mode 100644 index 0000000..c18f23c Binary files /dev/null and b/reward_plots/advantage_plot_step_410.png differ diff --git a/reward_plots/advantage_plot_step_420.png b/reward_plots/advantage_plot_step_420.png new file mode 100644 index 0000000..45622b2 Binary files /dev/null and b/reward_plots/advantage_plot_step_420.png differ diff --git a/reward_plots/advantage_plot_step_430.png b/reward_plots/advantage_plot_step_430.png new file mode 100644 index 0000000..9ebad74 Binary files /dev/null and b/reward_plots/advantage_plot_step_430.png differ diff --git a/reward_plots/advantage_plot_step_440.png b/reward_plots/advantage_plot_step_440.png new file mode 100644 index 0000000..ee18e14 Binary files /dev/null and b/reward_plots/advantage_plot_step_440.png differ diff --git a/reward_plots/advantage_plot_step_450.png b/reward_plots/advantage_plot_step_450.png new file mode 100644 index 0000000..2a0d911 Binary files /dev/null and b/reward_plots/advantage_plot_step_450.png differ diff --git a/reward_plots/advantage_plot_step_460.png b/reward_plots/advantage_plot_step_460.png new file mode 100644 index 0000000..8e879d7 Binary files /dev/null and 
b/reward_plots/advantage_plot_step_460.png differ diff --git a/reward_plots/advantage_plot_step_470.png b/reward_plots/advantage_plot_step_470.png new file mode 100644 index 0000000..b9b9cda Binary files /dev/null and b/reward_plots/advantage_plot_step_470.png differ diff --git a/reward_plots/advantage_plot_step_480.png b/reward_plots/advantage_plot_step_480.png new file mode 100644 index 0000000..faa0114 Binary files /dev/null and b/reward_plots/advantage_plot_step_480.png differ diff --git a/reward_plots/advantage_plot_step_490.png b/reward_plots/advantage_plot_step_490.png new file mode 100644 index 0000000..dceec26 Binary files /dev/null and b/reward_plots/advantage_plot_step_490.png differ diff --git a/reward_plots/advantage_plot_step_50.png b/reward_plots/advantage_plot_step_50.png new file mode 100644 index 0000000..9d4c702 Binary files /dev/null and b/reward_plots/advantage_plot_step_50.png differ diff --git a/reward_plots/advantage_plot_step_60.png b/reward_plots/advantage_plot_step_60.png new file mode 100644 index 0000000..e1cb9ce Binary files /dev/null and b/reward_plots/advantage_plot_step_60.png differ diff --git a/reward_plots/advantage_plot_step_70.png b/reward_plots/advantage_plot_step_70.png new file mode 100644 index 0000000..822c8f2 Binary files /dev/null and b/reward_plots/advantage_plot_step_70.png differ diff --git a/reward_plots/advantage_plot_step_80.png b/reward_plots/advantage_plot_step_80.png new file mode 100644 index 0000000..7c63143 Binary files /dev/null and b/reward_plots/advantage_plot_step_80.png differ diff --git a/reward_plots/advantage_plot_step_90.png b/reward_plots/advantage_plot_step_90.png new file mode 100644 index 0000000..ac4950f Binary files /dev/null and b/reward_plots/advantage_plot_step_90.png differ diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": 
"<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..e7cd2c1 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a +size 11422959 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ef6e98c --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,195 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "<think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "</think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + 
"151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "<tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "</tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": 
"<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true 
-%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..8841ac2 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 0.007517670260873274, + "train_runtime": 53153.3665, + "train_samples": 7000, + "train_samples_per_second": 0.452, + "train_steps_per_second": 0.009 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..291484f --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,9042 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5714285714285714, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "advantage_max": 0.1714239763095975, + "advantage_mean": 2.7163576388211652e-09, + "advantage_min": -0.18542360328137875, + "advantage_std": 0.14101680787280202, + "completion_length": 2571.2083587646484, + "epoch": 0.001142857142857143, + "grad_norm": 0.025767112150788307, + "kl": 0.0, + "learning_rate": 2e-08, + "loss": 0.0137, + "reward": 0.08349451050162315, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14101681299507618, + 
"rewards/cosine_scaled_reward": -0.015534311532974243, + "rewards/format_reward": 0.5208333488553762, + "step": 1 + }, + { + "advantage_max": 0.07242919644340873, + "advantage_mean": 1.8626451769865326e-09, + "advantage_min": -0.09870566707104445, + "advantage_std": 0.071280462667346, + "completion_length": 2804.395881652832, + "epoch": 0.002285714285714286, + "grad_norm": 0.010948998853564262, + "kl": 0.0, + "learning_rate": 4e-08, + "loss": 0.0044, + "reward": 0.04647743375971913, + "reward_advantage_correlation": 1.0, + "reward_std": 0.071280462667346, + "rewards/cosine_scaled_reward": -0.04980122856795788, + "rewards/format_reward": 0.37500000558793545, + "step": 2 + }, + { + "advantage_max": 0.10077127907425165, + "advantage_mean": 3.880513965714982e-11, + "advantage_min": -0.07837366871535778, + "advantage_std": 0.07264299970120192, + "completion_length": 3330.7291870117188, + "epoch": 0.0034285714285714284, + "grad_norm": 0.011240585707128048, + "kl": 4.692375659942627e-05, + "learning_rate": 6e-08, + "loss": -0.0004, + "reward": -0.05792155209928751, + "reward_advantage_correlation": 1.0, + "reward_std": 0.07264300249516964, + "rewards/cosine_scaled_reward": -0.24313471233472228, + "rewards/format_reward": 0.14583333395421505, + "step": 3 + }, + { + "advantage_max": 0.1539376201108098, + "advantage_mean": -1.396983917434369e-09, + "advantage_min": -0.13278733659535646, + "advantage_std": 0.11549357417970896, + "completion_length": 2221.6875228881836, + "epoch": 0.004571428571428572, + "grad_norm": 0.021713746711611748, + "kl": 4.139542579650879e-05, + "learning_rate": 8e-08, + "loss": -0.0008, + "reward": 0.07605884410440922, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11549357557669282, + "rewards/cosine_scaled_reward": -0.10020758584141731, + "rewards/format_reward": 0.6458333358168602, + "step": 4 + }, + { + "advantage_max": 0.18950440920889378, + "advantage_mean": 2.561137149581505e-09, + "advantage_min": -0.1065681865438819, + 
"advantage_std": 0.1075586169026792, + "completion_length": 3417.7291870117188, + "epoch": 0.005714285714285714, + "grad_norm": 0.01573144644498825, + "kl": 4.338473081588745e-05, + "learning_rate": 1e-07, + "loss": 0.0028, + "reward": -0.03302042291034013, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10755862621590495, + "rewards/cosine_scaled_reward": -0.20160695351660252, + "rewards/format_reward": 0.2083333395421505, + "step": 5 + }, + { + "advantage_max": 0.11689717648550868, + "advantage_mean": -1.7850348837944452e-09, + "advantage_min": -0.08994922507554293, + "advantage_std": 0.08724205708131194, + "completion_length": 2931.1458892822266, + "epoch": 0.006857142857142857, + "grad_norm": 0.021110277622938156, + "kl": 3.407290205359459e-05, + "learning_rate": 1.2e-07, + "loss": 0.0099, + "reward": -0.009482193738222122, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.08724205940961838, + "rewards/cosine_scaled_reward": -0.204607228981331, + "rewards/format_reward": 0.35416667349636555, + "step": 6 + }, + { + "advantage_max": 0.11794257629662752, + "advantage_mean": -3.1820189400066923e-09, + "advantage_min": -0.1437535872682929, + "advantage_std": 0.10767027572728693, + "completion_length": 3049.3959045410156, + "epoch": 0.008, + "grad_norm": 0.020024148747324944, + "kl": 2.3268163204193115e-05, + "learning_rate": 1.4e-07, + "loss": 0.0037, + "reward": 0.11388289113529027, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.10767027852125466, + "rewards/cosine_scaled_reward": 0.003962432965636253, + "rewards/format_reward": 0.6666666828095913, + "step": 7 + }, + { + "advantage_max": 0.20535230357199907, + "advantage_mean": -1.7074246877468724e-09, + "advantage_min": -0.16674288269132376, + "advantage_std": 0.14301629923284054, + "completion_length": 2752.0625, + "epoch": 0.009142857142857144, + "grad_norm": 0.025196732953190804, + "kl": 2.1200627088546753e-05, + "learning_rate": 1.6e-07, + "loss": 0.0065, 
+ "reward": 0.1058007568353787, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14301631227135658, + "rewards/cosine_scaled_reward": 0.06942666228860617, + "rewards/format_reward": 0.4791666753590107, + "step": 8 + }, + { + "advantage_max": 0.17053810507059097, + "advantage_mean": -9.701276934559466e-10, + "advantage_min": -0.12266492750495672, + "advantage_std": 0.12067469954490662, + "completion_length": 3259.8125610351562, + "epoch": 0.010285714285714285, + "grad_norm": 0.02746347151696682, + "kl": 4.096329212188721e-05, + "learning_rate": 1.8e-07, + "loss": 0.0086, + "reward": 0.006678506499156356, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.12067470327019691, + "rewards/cosine_scaled_reward": -0.11600704118609428, + "rewards/format_reward": 0.2708333395421505, + "step": 9 + }, + { + "advantage_max": 0.1689818874001503, + "advantage_mean": -3.2596291221764773e-09, + "advantage_min": -0.1334312935359776, + "advantage_std": 0.11464598076418042, + "completion_length": 2768.8542098999023, + "epoch": 0.011428571428571429, + "grad_norm": 0.019988562911748886, + "kl": 3.0182301998138428e-05, + "learning_rate": 2e-07, + "loss": 0.0019, + "reward": 0.03028559315134771, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11464598076418042, + "rewards/cosine_scaled_reward": -0.10943621303886175, + "rewards/format_reward": 0.3958333358168602, + "step": 10 + }, + { + "advantage_max": 0.07887471001595259, + "advantage_mean": 1.24176348370586e-09, + "advantage_min": -0.07153345271945, + "advantage_std": 0.06311596930027008, + "completion_length": 3333.9166717529297, + "epoch": 0.012571428571428572, + "grad_norm": 0.009001471102237701, + "kl": 3.399699926376343e-05, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0012, + "reward": -0.07426449563354254, + "reward_advantage_correlation": 1.0, + "reward_std": 0.06311597069725394, + "rewards/cosine_scaled_reward": -0.27152102813124657, + "rewards/format_reward": 0.1041666716337204, 
+ "step": 11 + }, + { + "advantage_max": 0.1358098853379488, + "advantage_mean": 1.2417634559502844e-09, + "advantage_min": -0.145633140578866, + "advantage_std": 0.11910986108705401, + "completion_length": 2601.395896911621, + "epoch": 0.013714285714285714, + "grad_norm": 0.016579382121562958, + "kl": 3.965198993682861e-05, + "learning_rate": 2.4e-07, + "loss": 0.0048, + "reward": 0.05325879342854023, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1191098652780056, + "rewards/cosine_scaled_reward": -0.14510760456323624, + "rewards/format_reward": 0.6041666753590107, + "step": 12 + }, + { + "advantage_max": 0.11820426164194942, + "advantage_mean": -4.656612873077393e-10, + "advantage_min": -0.10757657652720809, + "advantage_std": 0.09022333845496178, + "completion_length": 2989.5208740234375, + "epoch": 0.014857142857142857, + "grad_norm": 0.02005729451775551, + "kl": 3.2998621463775635e-05, + "learning_rate": 2.6e-07, + "loss": 0.008, + "reward": 0.0009057910647243261, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.09022334171459079, + "rewards/cosine_scaled_reward": -0.1317348200827837, + "rewards/format_reward": 0.2708333358168602, + "step": 13 + }, + { + "advantage_max": 0.14062324352562428, + "advantage_mean": 3.1044090909038147e-10, + "advantage_min": -0.16110873501747847, + "advantage_std": 0.11608104594051838, + "completion_length": 2749.0208587646484, + "epoch": 0.016, + "grad_norm": 0.017504651099443436, + "kl": 2.360716462135315e-05, + "learning_rate": 2.8e-07, + "loss": 0.0057, + "reward": 0.05834482208592817, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11608104268088937, + "rewards/cosine_scaled_reward": -0.06565599981695414, + "rewards/format_reward": 0.4791666753590107, + "step": 14 + }, + { + "advantage_max": 0.13011277560144663, + "advantage_mean": -1.940255407728575e-09, + "advantage_min": -0.10297658666968346, + "advantage_std": 0.0875613666139543, + "completion_length": 
2769.208366394043, + "epoch": 0.017142857142857144, + "grad_norm": 0.010686655528843403, + "kl": 1.9287224858999252e-05, + "learning_rate": 3e-07, + "loss": 0.0016, + "reward": 0.061318085878156126, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.08756137173622847, + "rewards/cosine_scaled_reward": -0.007162087596952915, + "rewards/format_reward": 0.3750000037252903, + "step": 15 + }, + { + "advantage_max": 0.13473028596490622, + "advantage_mean": 4.1133415354388525e-09, + "advantage_min": -0.077317263931036, + "advantage_std": 0.08469738904386759, + "completion_length": 3583.0833435058594, + "epoch": 0.018285714285714287, + "grad_norm": 0.0169773381203413, + "kl": 3.8251280784606934e-05, + "learning_rate": 3.2e-07, + "loss": 0.0001, + "reward": -0.07893023523502052, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.08469739044085145, + "rewards/cosine_scaled_reward": -0.24335206672549248, + "rewards/format_reward": 0.02083333395421505, + "step": 16 + }, + { + "advantage_max": 0.13479452300816774, + "advantage_mean": -3.065603576546394e-09, + "advantage_min": -0.15165873477235436, + "advantage_std": 0.12228226847946644, + "completion_length": 2357.625026702881, + "epoch": 0.019428571428571427, + "grad_norm": 0.01721261627972126, + "kl": 4.204362630844116e-05, + "learning_rate": 3.4000000000000003e-07, + "loss": -0.0014, + "reward": 0.10265706898644567, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12228227453306317, + "rewards/cosine_scaled_reward": 0.030887765809893608, + "rewards/format_reward": 0.5416666734963655, + "step": 17 + }, + { + "advantage_max": 0.12159187206998467, + "advantage_mean": -3.8805105656569694e-10, + "advantage_min": -0.12701823841780424, + "advantage_std": 0.09618540527299047, + "completion_length": 2885.3125, + "epoch": 0.02057142857142857, + "grad_norm": 0.02120651677250862, + "kl": 3.0279159545898438e-05, + "learning_rate": 3.6e-07, + "loss": 0.007, + "reward": 0.02408734685741365, 
+ "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.09618540899828076, + "rewards/cosine_scaled_reward": -0.11675130156800151, + "rewards/format_reward": 0.37500000931322575, + "step": 18 + }, + { + "advantage_max": 0.2786301076412201, + "advantage_mean": -2.0178656801039807e-09, + "advantage_min": -0.17539117764681578, + "advantage_std": 0.1897407090291381, + "completion_length": 3042.479202270508, + "epoch": 0.021714285714285714, + "grad_norm": 0.0366692878305912, + "kl": 2.7358531951904297e-05, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0142, + "reward": 0.08209404302760959, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1897407118231058, + "rewards/cosine_scaled_reward": 0.05350984400138259, + "rewards/format_reward": 0.37500000186264515, + "step": 19 + }, + { + "advantage_max": 0.17369511630386114, + "advantage_mean": -3.7252902707063384e-09, + "advantage_min": -0.143904535099864, + "advantage_std": 0.12505553639493883, + "completion_length": 2488.437587738037, + "epoch": 0.022857142857142857, + "grad_norm": 0.026572400704026222, + "kl": 1.1764466762542725e-05, + "learning_rate": 4e-07, + "loss": 0.0135, + "reward": 0.09353481137077324, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12505554361268878, + "rewards/cosine_scaled_reward": -0.01666953694075346, + "rewards/format_reward": 0.5833333395421505, + "step": 20 + }, + { + "advantage_max": 0.15747881215065718, + "advantage_mean": -3.1044090909038147e-10, + "advantage_min": -0.09395024552941322, + "advantage_std": 0.09924266301095486, + "completion_length": 2713.125015258789, + "epoch": 0.024, + "grad_norm": 0.016476722434163094, + "kl": 3.544241189956665e-05, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0059, + "reward": 0.06582744396291673, + "reward_advantage_correlation": 1.0, + "reward_std": 0.0992426648736, + "rewards/cosine_scaled_reward": -0.02483982127159834, + "rewards/format_reward": 0.43750000558793545, + 
"step": 21 + }, + { + "advantage_max": 0.1536049460992217, + "advantage_mean": -5.82076628563577e-09, + "advantage_min": -0.19146334286779165, + "advantage_std": 0.13952347543090582, + "completion_length": 1874.5417251586914, + "epoch": 0.025142857142857144, + "grad_norm": 0.02956153266131878, + "kl": 2.4802982807159424e-05, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0068, + "reward": 0.12891051033511758, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1395234796218574, + "rewards/cosine_scaled_reward": -0.014458773657679558, + "rewards/format_reward": 0.7916666772216558, + "step": 22 + }, + { + "advantage_max": 0.23872516956180334, + "advantage_mean": 1.3969838758010056e-09, + "advantage_min": -0.15435245260596275, + "advantage_std": 0.15457574743777514, + "completion_length": 2569.3125610351562, + "epoch": 0.026285714285714287, + "grad_norm": 0.026004575192928314, + "kl": 3.0472874641418457e-05, + "learning_rate": 4.6e-07, + "loss": 0.0088, + "reward": 0.06847280421061441, + "reward_advantage_correlation": 1.0, + "reward_std": 0.15457575675100088, + "rewards/cosine_scaled_reward": -0.04925672709941864, + "rewards/format_reward": 0.5000000149011612, + "step": 23 + }, + { + "advantage_max": 0.18653128948062658, + "advantage_mean": -2.444721682037798e-09, + "advantage_min": -0.1594811975955963, + "advantage_std": 0.13956549763679504, + "completion_length": 2802.395881652832, + "epoch": 0.027428571428571427, + "grad_norm": 0.020463019609451294, + "kl": 2.32793390750885e-05, + "learning_rate": 4.8e-07, + "loss": 0.0016, + "reward": 0.10617550695315003, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13956550369039178, + "rewards/cosine_scaled_reward": 0.03222842514514923, + "rewards/format_reward": 0.5625000186264515, + "step": 24 + }, + { + "advantage_max": 0.16575950756669044, + "advantage_mean": -1.7074248265247505e-09, + "advantage_min": -0.138729483820498, + "advantage_std": 0.1247619753703475, + "completion_length": 
2796.5208740234375, + "epoch": 0.02857142857142857, + "grad_norm": 0.02838418260216713, + "kl": 3.729015588760376e-05, + "learning_rate": 5e-07, + "loss": 0.0122, + "reward": 0.032140296418219805, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12476197723299265, + "rewards/cosine_scaled_reward": -0.10717500746250153, + "rewards/format_reward": 0.39583334140479565, + "step": 25 + }, + { + "advantage_max": 0.1065903976559639, + "advantage_mean": 2.5611371079481415e-09, + "advantage_min": -0.12964865937829018, + "advantage_std": 0.096164018381387, + "completion_length": 3076.7708740234375, + "epoch": 0.029714285714285714, + "grad_norm": 0.023165522143244743, + "kl": 3.162771463394165e-05, + "learning_rate": 5.2e-07, + "loss": 0.0058, + "reward": 0.05766999162733555, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09616402024403214, + "rewards/cosine_scaled_reward": -0.03639649413526058, + "rewards/format_reward": 0.416666679084301, + "step": 26 + }, + { + "advantage_max": 0.2176275816746056, + "advantage_mean": -2.2700988727697435e-09, + "advantage_min": -0.14101197582203895, + "advantage_std": 0.14790143747814, + "completion_length": 3005.291702270508, + "epoch": 0.030857142857142857, + "grad_norm": 0.02873804420232773, + "kl": 2.8399168513715267e-05, + "learning_rate": 5.4e-07, + "loss": 0.0111, + "reward": 0.04077844490529969, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14790144679136574, + "rewards/cosine_scaled_reward": -0.0776456345920451, + "rewards/format_reward": 0.39583333767950535, + "step": 27 + }, + { + "advantage_max": 0.15818555373698473, + "advantage_mean": -2.7551626674560126e-09, + "advantage_min": -0.17423970997333527, + "advantage_std": 0.13145049894228578, + "completion_length": 2832.729202270508, + "epoch": 0.032, + "grad_norm": 0.018948372453451157, + "kl": 3.7573277950286865e-05, + "learning_rate": 5.6e-07, + "loss": 0.0082, + "reward": 0.0805886962916702, + "reward_advantage_correlation": 1.0, + 
"reward_std": 0.1314505017362535, + "rewards/cosine_scaled_reward": 0.016917362809181213, + "rewards/format_reward": 0.43750000558793545, + "step": 28 + }, + { + "advantage_max": 0.18313346058130264, + "advantage_mean": 3.1820188567399654e-09, + "advantage_min": -0.1069188816472888, + "advantage_std": 0.11597575852647424, + "completion_length": 3305.0208740234375, + "epoch": 0.03314285714285714, + "grad_norm": 0.03014214336872101, + "kl": 2.7902424335479736e-05, + "learning_rate": 5.8e-07, + "loss": 0.0136, + "reward": -0.044296178268268704, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.11597575340420008, + "rewards/cosine_scaled_reward": -0.22412699135020375, + "rewards/format_reward": 0.18750000558793545, + "step": 29 + }, + { + "advantage_max": 0.24942097766324878, + "advantage_mean": 4.3461721582760404e-09, + "advantage_min": -0.1541912415996194, + "advantage_std": 0.15846544690430164, + "completion_length": 2920.562545776367, + "epoch": 0.03428571428571429, + "grad_norm": 0.022144218906760216, + "kl": 2.2359192371368408e-05, + "learning_rate": 6e-07, + "loss": 0.0071, + "reward": 0.06114856945350766, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.15846544643864036, + "rewards/cosine_scaled_reward": -0.04798364010639489, + "rewards/format_reward": 0.4583333469927311, + "step": 30 + }, + { + "advantage_max": 0.1647213133983314, + "advantage_mean": 2.2506963098800625e-09, + "advantage_min": -0.11298016970977187, + "advantage_std": 0.11202249862253666, + "completion_length": 3126.5833587646484, + "epoch": 0.03542857142857143, + "grad_norm": 0.02341640554368496, + "kl": 1.6361474990844727e-05, + "learning_rate": 6.2e-07, + "loss": 0.0102, + "reward": -0.011873322539031506, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11202250327914953, + "rewards/cosine_scaled_reward": -0.1413223911076784, + "rewards/format_reward": 0.2083333395421505, + "step": 31 + }, + { + "advantage_max": 
0.1927646165713668, + "advantage_mean": -4.0357314226580066e-09, + "advantage_min": -0.1581784477457404, + "advantage_std": 0.12995850760489702, + "completion_length": 3253.125030517578, + "epoch": 0.036571428571428574, + "grad_norm": 0.016674285754561424, + "kl": 1.6938894987106323e-05, + "learning_rate": 6.4e-07, + "loss": 0.004, + "reward": 0.0447476077824831, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12995851668529212, + "rewards/cosine_scaled_reward": -0.0544386301189661, + "rewards/format_reward": 0.3750000149011612, + "step": 32 + }, + { + "advantage_max": 0.20177126210182905, + "advantage_mean": -1.241763414316921e-09, + "advantage_min": -0.18795710895210505, + "advantage_std": 0.15662608901038766, + "completion_length": 3411.541717529297, + "epoch": 0.037714285714285714, + "grad_norm": 0.02308499813079834, + "kl": 2.4718232452869415e-05, + "learning_rate": 6.6e-07, + "loss": 0.0056, + "reward": 0.04366421408485621, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.15662609227001667, + "rewards/cosine_scaled_reward": -0.026438521221280098, + "rewards/format_reward": 0.31250000558793545, + "step": 33 + }, + { + "advantage_max": 0.17956872167997062, + "advantage_mean": -3.104408619059029e-09, + "advantage_min": -0.14220268558710814, + "advantage_std": 0.12088308949023485, + "completion_length": 2531.812530517578, + "epoch": 0.038857142857142854, + "grad_norm": 0.014704632572829723, + "kl": 6.726384162902832e-05, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0015, + "reward": 0.09752498054876924, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12088309531100094, + "rewards/cosine_scaled_reward": 0.025213209679350257, + "rewards/format_reward": 0.520833333954215, + "step": 34 + }, + { + "advantage_max": 0.29936027619987726, + "advantage_mean": -5.551115123125783e-17, + "advantage_min": -0.172477250918746, + "advantage_std": 0.19010970601812005, + "completion_length": 2973.3750534057617, + "epoch": 0.04, + 
"grad_norm": 0.028409497812390327, + "kl": 5.359947681427002e-05, + "learning_rate": 7e-07, + "loss": 0.0123, + "reward": 0.04867997905239463, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.19010971300303936, + "rewards/cosine_scaled_reward": -0.024306990206241608, + "rewards/format_reward": 0.3333333395421505, + "step": 35 + }, + { + "advantage_max": 0.13590538362041116, + "advantage_mean": 9.313226370655237e-10, + "advantage_min": -0.09683632245287299, + "advantage_std": 0.08833803655579686, + "completion_length": 3288.0833740234375, + "epoch": 0.04114285714285714, + "grad_norm": 0.015868162736296654, + "kl": 7.23712146282196e-05, + "learning_rate": 7.2e-07, + "loss": 0.0045, + "reward": -0.03358328447211534, + "reward_advantage_correlation": 1.0, + "reward_std": 0.0883380388841033, + "rewards/cosine_scaled_reward": -0.22424227092415094, + "rewards/format_reward": 0.2500000074505806, + "step": 36 + }, + { + "advantage_max": 0.07873150100931525, + "advantage_mean": -3.1044085357923024e-10, + "advantage_min": -0.07795562036335468, + "advantage_std": 0.061489060055464506, + "completion_length": 3361.562530517578, + "epoch": 0.04228571428571429, + "grad_norm": 0.009638470597565174, + "kl": 2.0228326320648193e-05, + "learning_rate": 7.4e-07, + "loss": 0.0021, + "reward": -0.050579807022586465, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.06148906052112579, + "rewards/cosine_scaled_reward": -0.24296171963214874, + "rewards/format_reward": 0.1875, + "step": 37 + }, + { + "advantage_max": 0.1380753773264587, + "advantage_mean": -6.20881665525097e-10, + "advantage_min": -0.1019920501857996, + "advantage_std": 0.09536230750381947, + "completion_length": 3306.4583587646484, + "epoch": 0.04342857142857143, + "grad_norm": 0.015968551859259605, + "kl": 4.314631223678589e-05, + "learning_rate": 7.599999999999999e-07, + "loss": 0.002, + "reward": -0.031228411942720413, + "reward_advantage_correlation": 1.0, + "reward_std": 
0.09536230750381947, + "rewards/cosine_scaled_reward": -0.16523530799895525, + "rewards/format_reward": 0.14583333395421505, + "step": 38 + }, + { + "advantage_max": 0.1628796993754804, + "advantage_mean": -1.7074248126469627e-09, + "advantage_min": -0.1276350189000368, + "advantage_std": 0.1117043545236811, + "completion_length": 2918.7291984558105, + "epoch": 0.044571428571428574, + "grad_norm": 0.019875982776284218, + "kl": 6.305798888206482e-05, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0022, + "reward": 0.05022500859195134, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.11170436086831614, + "rewards/cosine_scaled_reward": -0.05875556939281523, + "rewards/format_reward": 0.4166666679084301, + "step": 39 + }, + { + "advantage_max": 0.1351394895464182, + "advantage_mean": 2.1730860721991263e-09, + "advantage_min": -0.12486388254910707, + "advantage_std": 0.10662073362618685, + "completion_length": 2454.8958587646484, + "epoch": 0.045714285714285714, + "grad_norm": 0.022016212344169617, + "kl": 0.0002264752984046936, + "learning_rate": 8e-07, + "loss": 0.0066, + "reward": 0.06583994440734386, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10662073455750942, + "rewards/cosine_scaled_reward": -0.07870683167129755, + "rewards/format_reward": 0.541666679084301, + "step": 40 + }, + { + "advantage_max": 0.1448317738249898, + "advantage_mean": 1.3969838966176873e-09, + "advantage_min": -0.1136879026889801, + "advantage_std": 0.10412771673873067, + "completion_length": 3089.3333740234375, + "epoch": 0.046857142857142854, + "grad_norm": 0.01577775366604328, + "kl": 4.999339580535889e-05, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0071, + "reward": -0.007575191382784396, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.10412772092968225, + "rewards/cosine_scaled_reward": -0.20044021122157574, + "rewards/format_reward": 0.3541666716337204, + "step": 41 + }, + { + "advantage_max": 
0.1294115127529949, + "advantage_mean": 8.149072944219071e-10, + "advantage_min": -0.08364611677825451, + "advantage_std": 0.08207520749419928, + "completion_length": 2773.3333702087402, + "epoch": 0.048, + "grad_norm": 0.014152735471725464, + "kl": 0.00012195669114589691, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0055, + "reward": -0.024103335803374648, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.08207521075382829, + "rewards/cosine_scaled_reward": -0.23803024366497993, + "rewards/format_reward": 0.33333333395421505, + "step": 42 + }, + { + "advantage_max": 0.1437476323917508, + "advantage_mean": 1.5522043927962415e-09, + "advantage_min": -0.11569147277623415, + "advantage_std": 0.09973477618768811, + "completion_length": 3171.854202270508, + "epoch": 0.04914285714285714, + "grad_norm": 0.014957522042095661, + "kl": 4.2708590626716614e-05, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0047, + "reward": -0.022214435506612062, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09973477479070425, + "rewards/cosine_scaled_reward": -0.16881779208779335, + "rewards/format_reward": 0.2083333358168602, + "step": 43 + }, + { + "advantage_max": 0.17799750808626413, + "advantage_mean": -1.7074247293802358e-09, + "advantage_min": -0.16484235506504774, + "advantage_std": 0.14439343940466642, + "completion_length": 2845.7708892822266, + "epoch": 0.05028571428571429, + "grad_norm": 0.0470295213162899, + "kl": 0.00029357708990573883, + "learning_rate": 8.799999999999999e-07, + "loss": 0.023, + "reward": 0.0359237277880311, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14439343893900514, + "rewards/cosine_scaled_reward": -0.05193283036351204, + "rewards/format_reward": 0.31250000931322575, + "step": 44 + }, + { + "advantage_max": 0.19965059403330088, + "advantage_mean": -8.537123091789667e-10, + "advantage_min": -0.1259068874642253, + "advantage_std": 0.1281011113896966, + "completion_length": 3430.0833740234375, + 
"epoch": 0.05142857142857143, + "grad_norm": 0.018138015642762184, + "kl": 6.644893437623978e-05, + "learning_rate": 9e-07, + "loss": 0.0027, + "reward": 0.02113422704860568, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12810111325234175, + "rewards/cosine_scaled_reward": -0.06111693615093827, + "rewards/format_reward": 0.2500000074505806, + "step": 45 + }, + { + "advantage_max": 0.11925685312598944, + "advantage_mean": 1.2417634420724966e-09, + "advantage_min": -0.06791578326374292, + "advantage_std": 0.07461203960701823, + "completion_length": 3229.770835876465, + "epoch": 0.052571428571428575, + "grad_norm": 0.010699857957661152, + "kl": 0.0001803375780582428, + "learning_rate": 9.2e-07, + "loss": 0.0005, + "reward": -0.06760753598064184, + "reward_advantage_correlation": 1.0, + "reward_std": 0.07461204146966338, + "rewards/cosine_scaled_reward": -0.2723505459725857, + "rewards/format_reward": 0.14583333395421505, + "step": 46 + }, + { + "advantage_max": 0.17866623401641846, + "advantage_mean": -7.6834113516e-09, + "advantage_min": -0.2375992350280285, + "advantage_std": 0.1749500371515751, + "completion_length": 2958.6250610351562, + "epoch": 0.053714285714285714, + "grad_norm": 0.029765864834189415, + "kl": 6.363540887832642e-05, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0121, + "reward": 0.11308036895934492, + "reward_advantage_correlation": 1.0, + "reward_std": 0.17495004460215569, + "rewards/cosine_scaled_reward": 0.07415074668824673, + "rewards/format_reward": 0.5208333432674408, + "step": 47 + }, + { + "advantage_max": 0.19035830302163959, + "advantage_mean": -3.4924599323638006e-10, + "advantage_min": -0.16151767084375024, + "advantage_std": 0.1434162282384932, + "completion_length": 2765.6250228881836, + "epoch": 0.054857142857142854, + "grad_norm": 0.02456662431359291, + "kl": 0.00045037176460027695, + "learning_rate": 9.6e-07, + "loss": 0.0078, + "reward": 0.07018421730026603, + "reward_advantage_correlation": 
0.9999999999999999, + "reward_std": 0.1434162249788642, + "rewards/cosine_scaled_reward": -0.013612153008580208, + "rewards/format_reward": 0.43750000558793545, + "step": 48 + }, + { + "advantage_max": 0.2279358534142375, + "advantage_mean": -1.4745941204208357e-09, + "advantage_min": -0.14564000815153122, + "advantage_std": 0.15678073978051543, + "completion_length": 2342.354232788086, + "epoch": 0.056, + "grad_norm": 0.02502998150885105, + "kl": 0.00013617053627967834, + "learning_rate": 9.8e-07, + "loss": 0.0089, + "reward": 0.06247584073571488, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.15678074583411217, + "rewards/cosine_scaled_reward": -0.09697807114571333, + "rewards/format_reward": 0.5625000037252903, + "step": 49 + }, + { + "advantage_max": 0.122623095754534, + "advantage_mean": -1.3969838619232178e-09, + "advantage_min": -0.1242001224309206, + "advantage_std": 0.1060920343734324, + "completion_length": 2923.5625228881836, + "epoch": 0.05714285714285714, + "grad_norm": 0.021566810086369514, + "kl": 0.00023385882377624512, + "learning_rate": 1e-06, + "loss": 0.0096, + "reward": 0.038516357075423, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10609203530475497, + "rewards/cosine_scaled_reward": -0.03362384531646967, + "rewards/format_reward": 0.29166666977107525, + "step": 50 + }, + { + "advantage_max": 0.10491376649588346, + "advantage_mean": -3.647680130169917e-09, + "advantage_min": -0.16684891190379858, + "advantage_std": 0.10262906912248582, + "completion_length": 2406.3750228881836, + "epoch": 0.05828571428571429, + "grad_norm": 0.02082122303545475, + "kl": 0.000521540641784668, + "learning_rate": 9.999890338174275e-07, + "loss": 0.0026, + "reward": 0.062345280312001705, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10262907319702208, + "rewards/cosine_scaled_reward": -0.05357350129634142, + "rewards/format_reward": 0.47916667722165585, + "step": 51 + }, + { + "advantage_max": 0.23814772348850965, + 
"advantage_mean": -2.2506961641632905e-09, + "advantage_min": -0.21308327466249466, + "advantage_std": 0.1857591886073351, + "completion_length": 2921.333366394043, + "epoch": 0.05942857142857143, + "grad_norm": 0.026340872049331665, + "kl": 0.00041694939136505127, + "learning_rate": 9.999561358041868e-07, + "loss": 0.0094, + "reward": 0.09964685700833797, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.18575919978320599, + "rewards/cosine_scaled_reward": 0.08640430495142937, + "rewards/format_reward": 0.4166666753590107, + "step": 52 + }, + { + "advantage_max": 0.22019695164635777, + "advantage_mean": -7.761021547647573e-10, + "advantage_min": -0.18540269322693348, + "advantage_std": 0.17194287246093154, + "completion_length": 2839.1250610351562, + "epoch": 0.060571428571428575, + "grad_norm": 0.02775190770626068, + "kl": 0.00034201203379780054, + "learning_rate": 9.999013075636804e-07, + "loss": 0.0087, + "reward": 0.0726233726600185, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.17194287665188313, + "rewards/cosine_scaled_reward": -0.035814402624964714, + "rewards/format_reward": 0.5000000093132257, + "step": 53 + }, + { + "advantage_max": 0.15818986017256975, + "advantage_mean": -8.537123299956484e-10, + "advantage_min": -0.15511069353669882, + "advantage_std": 0.13614196004346013, + "completion_length": 2918.3958892822266, + "epoch": 0.061714285714285715, + "grad_norm": 0.02299380674958229, + "kl": 0.0001678699627518654, + "learning_rate": 9.998245517681593e-07, + "loss": 0.009, + "reward": 0.15431041596457362, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1361419651657343, + "rewards/cosine_scaled_reward": 0.18133503943681717, + "rewards/format_reward": 0.5416666772216558, + "step": 54 + }, + { + "advantage_max": 0.19129236973822117, + "advantage_mean": -2.949188199208308e-09, + "advantage_min": -0.1691391160711646, + "advantage_std": 0.14046459831297398, + "completion_length": 2980.0208892822266, + 
"epoch": 0.06285714285714286, + "grad_norm": 0.020739315077662468, + "kl": 0.0004740804433822632, + "learning_rate": 9.997258721585931e-07, + "loss": 0.01, + "reward": 0.0497954161837697, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14046460296958685, + "rewards/cosine_scaled_reward": -0.019190420862287283, + "rewards/format_reward": 0.33333333767950535, + "step": 55 + }, + { + "advantage_max": 0.08986138668842614, + "advantage_mean": -7.450580721823918e-09, + "advantage_min": -0.18162237294018269, + "advantage_std": 0.10682848328724504, + "completion_length": 2916.1666870117188, + "epoch": 0.064, + "grad_norm": 0.0158238485455513, + "kl": 0.0002256631851196289, + "learning_rate": 9.996052735444862e-07, + "loss": 0.007, + "reward": 0.06883894634665921, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.10682848887518048, + "rewards/cosine_scaled_reward": 0.016542285680770874, + "rewards/format_reward": 0.3750000074505806, + "step": 56 + }, + { + "advantage_max": 0.159211162943393, + "advantage_mean": -1.4745940371541089e-09, + "advantage_min": -0.14113077148795128, + "advantage_std": 0.11528732301667333, + "completion_length": 3302.312530517578, + "epoch": 0.06514285714285714, + "grad_norm": 0.01696748286485672, + "kl": 0.00016479287296533585, + "learning_rate": 9.994627618036452e-07, + "loss": 0.0063, + "reward": 0.011191772297024727, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11528732441365719, + "rewards/cosine_scaled_reward": -0.10429812222719193, + "rewards/format_reward": 0.27083334513008595, + "step": 57 + }, + { + "advantage_max": 0.18715458177030087, + "advantage_mean": -5.277494732891519e-09, + "advantage_min": -0.171913824044168, + "advantage_std": 0.13622049521654844, + "completion_length": 2435.8542404174805, + "epoch": 0.06628571428571428, + "grad_norm": 0.019368024542927742, + "kl": 0.0015213489532470703, + "learning_rate": 9.992983438818915e-07, + "loss": 0.0105, + "reward": 
0.14305569988209754, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13622050173580647, + "rewards/cosine_scaled_reward": 0.08871240820735693, + "rewards/format_reward": 0.6666666753590107, + "step": 58 + }, + { + "advantage_max": 0.14700328465551138, + "advantage_mean": -1.862645301886623e-09, + "advantage_min": -0.1413423651829362, + "advantage_std": 0.12064886884763837, + "completion_length": 2893.4791870117188, + "epoch": 0.06742857142857143, + "grad_norm": 0.02102687954902649, + "kl": 0.0005481839179992676, + "learning_rate": 9.991120277927223e-07, + "loss": 0.0087, + "reward": 0.034189446829259396, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12064887676388025, + "rewards/cosine_scaled_reward": -0.0558818019926548, + "rewards/format_reward": 0.31250000558793545, + "step": 59 + }, + { + "advantage_max": 0.1409899704158306, + "advantage_mean": -3.880510732190423e-09, + "advantage_min": -0.11852918658405542, + "advantage_std": 0.11182335065677762, + "completion_length": 2964.1459045410156, + "epoch": 0.06857142857142857, + "grad_norm": 0.01700800471007824, + "kl": 0.0003733038902282715, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0015, + "reward": 0.008576460648328066, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11182335112243891, + "rewards/cosine_scaled_reward": -0.16191277094185352, + "rewards/format_reward": 0.37500000186264515, + "step": 60 + }, + { + "advantage_max": 0.15250339172780514, + "advantage_mean": 2.173086086076914e-09, + "advantage_min": -0.1706991521641612, + "advantage_std": 0.13841882860288024, + "completion_length": 3171.541717529297, + "epoch": 0.06971428571428571, + "grad_norm": 0.03268995136022568, + "kl": 0.0005887793377041817, + "learning_rate": 9.98673738502114e-07, + "loss": 0.014, + "reward": 0.024381998693570495, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.13841883558779955, + "rewards/cosine_scaled_reward": 
-0.12546866945922375, + "rewards/format_reward": 0.39583334140479565, + "step": 61 + }, + { + "advantage_max": 0.2628666125237942, + "advantage_mean": -1.4745941620541991e-09, + "advantage_min": -0.18349808733910322, + "advantage_std": 0.1721328515559435, + "completion_length": 2700.2500610351562, + "epoch": 0.07085714285714285, + "grad_norm": 0.026928190141916275, + "kl": 0.0014960318803787231, + "learning_rate": 9.98421786662277e-07, + "loss": 0.0187, + "reward": 0.08059305348433554, + "reward_advantage_correlation": 1.0, + "reward_std": 0.17213285621255636, + "rewards/cosine_scaled_reward": -0.04422247753245756, + "rewards/format_reward": 0.5625000074505806, + "step": 62 + }, + { + "advantage_max": 0.17473032884299755, + "advantage_mean": -1.4745942522598199e-09, + "advantage_min": -0.19655942358076572, + "advantage_std": 0.16620324458926916, + "completion_length": 2423.541732788086, + "epoch": 0.072, + "grad_norm": 0.0306819137185812, + "kl": 0.0013110339641571045, + "learning_rate": 9.981479793771866e-07, + "loss": 0.0283, + "reward": 0.0984296789392829, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.16620325110852718, + "rewards/cosine_scaled_reward": 0.008712463080883026, + "rewards/format_reward": 0.5625000093132257, + "step": 63 + }, + { + "advantage_max": 0.16565060429275036, + "advantage_mean": -2.7939678071131624e-09, + "advantage_min": -0.16477500926703215, + "advantage_std": 0.14017474581487477, + "completion_length": 2889.3750610351562, + "epoch": 0.07314285714285715, + "grad_norm": 0.020427115261554718, + "kl": 0.0007784366607666016, + "learning_rate": 9.97852329991824e-07, + "loss": 0.0132, + "reward": 0.05052297201473266, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14017474791035056, + "rewards/cosine_scaled_reward": -0.07072503212839365, + "rewards/format_reward": 0.4375000074505806, + "step": 64 + }, + { + "advantage_max": 0.127271534409374, + "advantage_mean": -1.3969838619232178e-09, + 
"advantage_min": -0.0950427707284689, + "advantage_std": 0.08464562566950917, + "completion_length": 2746.354179382324, + "epoch": 0.07428571428571429, + "grad_norm": 0.013379656709730625, + "kl": 0.0009982585906982422, + "learning_rate": 9.975348529157229e-07, + "loss": 0.0042, + "reward": 0.03162489866372198, + "reward_advantage_correlation": 0.9999999999999997, + "reward_std": 0.08464562753215432, + "rewards/cosine_scaled_reward": -0.11611694470047951, + "rewards/format_reward": 0.41666666977107525, + "step": 65 + }, + { + "advantage_max": 0.149964171461761, + "advantage_mean": -3.5700701006557978e-09, + "advantage_min": -0.13461025152355433, + "advantage_std": 0.10918906982988119, + "completion_length": 2079.6041946411133, + "epoch": 0.07542857142857143, + "grad_norm": 0.014194848015904427, + "kl": 0.0012865066528320312, + "learning_rate": 9.971955636222684e-07, + "loss": 0.004, + "reward": 0.11370784028986236, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10918907588347793, + "rewards/cosine_scaled_reward": 0.05414431728422642, + "rewards/format_reward": 0.5625000018626451, + "step": 66 + }, + { + "advantage_max": 0.08563538501039147, + "advantage_mean": 2.1730860513824446e-09, + "advantage_min": -0.07122973585501313, + "advantage_std": 0.05874074366874993, + "completion_length": 3511.4375, + "epoch": 0.07657142857142857, + "grad_norm": 0.008974037133157253, + "kl": 0.0012157298624515533, + "learning_rate": 9.968344786479415e-07, + "loss": 0.0014, + "reward": -0.08041232687537558, + "reward_advantage_correlation": 1.0, + "reward_std": 0.05874074646271765, + "rewards/cosine_scaled_reward": -0.2987704258412123, + "rewards/format_reward": 0.1250000037252903, + "step": 67 + }, + { + "advantage_max": 0.1881636488251388, + "advantage_mean": -1.2417634281947088e-09, + "advantage_min": -0.17326833494007587, + "advantage_std": 0.16107123950496316, + "completion_length": 2121.895866394043, + "epoch": 0.07771428571428571, + "grad_norm": 0.02590767852962017, + 
"kl": 0.004287242889404297, + "learning_rate": 9.964516155915151e-07, + "loss": 0.0219, + "reward": 0.07258323905989528, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1610712418332696, + "rewards/cosine_scaled_reward": -0.08903148956596851, + "rewards/format_reward": 0.6041666716337204, + "step": 68 + }, + { + "advantage_max": 0.17149817757308483, + "advantage_mean": 6.208817349140361e-10, + "advantage_min": -0.09525131899863482, + "advantage_std": 0.1044535138644278, + "completion_length": 2714.416748046875, + "epoch": 0.07885714285714286, + "grad_norm": 0.017063690349459648, + "kl": 0.0025910139083862305, + "learning_rate": 9.960469931131936e-07, + "loss": 0.0049, + "reward": -0.0257909067440778, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10445351805537939, + "rewards/cosine_scaled_reward": -0.26491281390190125, + "rewards/format_reward": 0.3750000037252903, + "step": 69 + }, + { + "advantage_max": 0.13912878511473536, + "advantage_mean": -1.5522043234073024e-09, + "advantage_min": -0.1294058826752007, + "advantage_std": 0.11556470859795809, + "completion_length": 3043.625015258789, + "epoch": 0.08, + "grad_norm": 0.02166864648461342, + "kl": 0.0016424953937530518, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0011, + "reward": 0.030413513217354193, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11556471651419997, + "rewards/cosine_scaled_reward": -0.11716132145375013, + "rewards/format_reward": 0.4166666716337204, + "step": 70 + }, + { + "advantage_max": 0.1449303338304162, + "advantage_mean": -8.537123299956484e-10, + "advantage_min": -0.12476017605513334, + "advantage_std": 0.11173301562666893, + "completion_length": 2603.7291717529297, + "epoch": 0.08114285714285714, + "grad_norm": 0.012032200582325459, + "kl": 0.0020183324813842773, + "learning_rate": 9.951725498333448e-07, + "loss": 0.0006, + "reward": 0.041470743250101805, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11173302121460438, + 
"rewards/cosine_scaled_reward": -0.06599017698317766, + "rewards/format_reward": 0.375, + "step": 71 + }, + { + "advantage_max": 0.1914712623693049, + "advantage_mean": -1.2417634420724966e-09, + "advantage_min": -0.14899978134781122, + "advantage_std": 0.12655010493472219, + "completion_length": 3024.2083740234375, + "epoch": 0.08228571428571428, + "grad_norm": 0.025330260396003723, + "kl": 0.0027605295181274414, + "learning_rate": 9.947027716509488e-07, + "loss": 0.0173, + "reward": -0.00684193754568696, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12655011098831892, + "rewards/cosine_scaled_reward": -0.18876908160746098, + "rewards/format_reward": 0.33333334140479565, + "step": 72 + }, + { + "advantage_max": 0.18401953671127558, + "advantage_mean": 9.31322616248842e-10, + "advantage_min": -0.12465803744271398, + "advantage_std": 0.12810300663113594, + "completion_length": 3407.2500610351562, + "epoch": 0.08342857142857144, + "grad_norm": 0.023279855027794838, + "kl": 0.0006111264228820801, + "learning_rate": 9.942113192828444e-07, + "loss": 0.0084, + "reward": -0.0037489386450033635, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12810300663113594, + "rewards/cosine_scaled_reward": -0.12534647807478905, + "rewards/format_reward": 0.2291666679084301, + "step": 73 + }, + { + "advantage_max": 0.12516837287694216, + "advantage_mean": -1.6686196174786616e-09, + "advantage_min": -0.1078806221485138, + "advantage_std": 0.09417479066178203, + "completion_length": 3254.875030517578, + "epoch": 0.08457142857142858, + "grad_norm": 0.019327718764543533, + "kl": 0.0016609132289886475, + "learning_rate": 9.93698216681727e-07, + "loss": 0.0035, + "reward": 0.026729536708444357, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09417479438707232, + "rewards/cosine_scaled_reward": -0.055625975131988525, + "rewards/format_reward": 0.27083333767950535, + "step": 74 + }, + { + "advantage_max": 0.14641679031774402, + 
"advantage_mean": -7.528190813788083e-09, + "advantage_min": -0.13670456875115633, + "advantage_std": 0.11592453811317682, + "completion_length": 3050.3541870117188, + "epoch": 0.08571428571428572, + "grad_norm": 0.018251126632094383, + "kl": 0.0018388032913208008, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0044, + "reward": 0.07386055216193199, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11592454044148326, + "rewards/cosine_scaled_reward": 0.03964068624190986, + "rewards/format_reward": 0.3541666753590107, + "step": 75 + }, + { + "advantage_max": 0.11959696374833584, + "advantage_mean": 3.1044086745701804e-10, + "advantage_min": -0.11977787129580975, + "advantage_std": 0.10093671828508377, + "completion_length": 2620.500045776367, + "epoch": 0.08685714285714285, + "grad_norm": 0.02156541682779789, + "kl": 0.0006852000951766968, + "learning_rate": 9.926071618660237e-07, + "loss": 0.0103, + "reward": 0.026126212440431118, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10093672387301922, + "rewards/cosine_scaled_reward": -0.18299918808043003, + "rewards/format_reward": 0.5208333469927311, + "step": 76 + }, + { + "advantage_max": 0.1113869184628129, + "advantage_mean": -4.287964372462483e-09, + "advantage_min": -0.1384631348773837, + "advantage_std": 0.0967109005432576, + "completion_length": 3143.729217529297, + "epoch": 0.088, + "grad_norm": 0.016500068828463554, + "kl": 0.001096084713935852, + "learning_rate": 9.9202926282791e-07, + "loss": -0.0054, + "reward": 0.04420704103540629, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.09671090613119304, + "rewards/cosine_scaled_reward": -0.058531179500278085, + "rewards/format_reward": 0.37500000931322575, + "step": 77 + }, + { + "advantage_max": 0.16691427025943995, + "advantage_mean": 1.2417634420724966e-09, + "advantage_min": -0.1529896855354309, + "advantage_std": 0.13225172739475965, + "completion_length": 3109.354202270508, + "epoch": 0.08914285714285715, + 
"grad_norm": 0.021866677328944206, + "kl": 0.0009225308895111084, + "learning_rate": 9.91429819907136e-07, + "loss": 0.0065, + "reward": 0.03236245736479759, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13225173251703382, + "rewards/cosine_scaled_reward": -0.09289113059639931, + "rewards/format_reward": 0.3750000074505806, + "step": 78 + }, + { + "advantage_max": 0.15700082294642925, + "advantage_mean": 1.125348175756713e-09, + "advantage_min": -0.10489925090223551, + "advantage_std": 0.10893942508846521, + "completion_length": 2253.1250534057617, + "epoch": 0.09028571428571429, + "grad_norm": 0.017613250762224197, + "kl": 0.0023194551467895508, + "learning_rate": 9.908088623197048e-07, + "loss": -0.0019, + "reward": 0.0731448968872428, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1089394255541265, + "rewards/cosine_scaled_reward": -0.08033954538404942, + "rewards/format_reward": 0.5833333358168602, + "step": 79 + }, + { + "advantage_max": 0.15978450048714876, + "advantage_mean": 4.6566125261326974e-10, + "advantage_min": -0.13024994870647788, + "advantage_std": 0.11621860601007938, + "completion_length": 3283.041717529297, + "epoch": 0.09142857142857143, + "grad_norm": 0.019476035609841347, + "kl": 0.0017393827438354492, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0064, + "reward": -0.002654203213751316, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11621860833838582, + "rewards/cosine_scaled_reward": -0.1530076563358307, + "rewards/format_reward": 0.29166666977107525, + "step": 80 + }, + { + "advantage_max": 0.1000084918923676, + "advantage_mean": -5.432714486608425e-10, + "advantage_min": -0.12673839554190636, + "advantage_std": 0.0900690802372992, + "completion_length": 3145.875030517578, + "epoch": 0.09257142857142857, + "grad_norm": 0.016789212822914124, + "kl": 0.005313873291015625, + "learning_rate": 9.895025252503755e-07, + "loss": 0.0054, + "reward": 0.0005171550437808037, + "reward_advantage_correlation": 1.0, 
+ "reward_std": 0.09006908535957336, + "rewards/cosine_scaled_reward": -0.1340514589101076, + "rewards/format_reward": 0.2708333432674408, + "step": 81 + }, + { + "advantage_max": 0.16784677654504776, + "advantage_mean": -6.519258133330652e-09, + "advantage_min": -0.13112404569983482, + "advantage_std": 0.12467234069481492, + "completion_length": 2761.5208435058594, + "epoch": 0.09371428571428571, + "grad_norm": 0.02522902935743332, + "kl": 0.0028878003358840942, + "learning_rate": 9.888172094375033e-07, + "loss": 0.0019, + "reward": 0.0789759517647326, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12467234022915363, + "rewards/cosine_scaled_reward": 0.024436804931610823, + "rewards/format_reward": 0.4166666716337204, + "step": 82 + }, + { + "advantage_max": 0.1500550713390112, + "advantage_mean": -8.537122953011789e-10, + "advantage_min": -0.13098668679594994, + "advantage_std": 0.10632651299238205, + "completion_length": 2842.187530517578, + "epoch": 0.09485714285714286, + "grad_norm": 0.017034098505973816, + "kl": 0.0017483234405517578, + "learning_rate": 9.881105062929221e-07, + "loss": 0.0024, + "reward": 0.019357941579073668, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.10632651718333364, + "rewards/cosine_scaled_reward": -0.12056602351367474, + "rewards/format_reward": 0.3541666679084301, + "step": 83 + }, + { + "advantage_max": 0.1495693982578814, + "advantage_mean": -1.474594099604154e-09, + "advantage_min": -0.19178100768476725, + "advantage_std": 0.1472853058949113, + "completion_length": 3046.791717529297, + "epoch": 0.096, + "grad_norm": 0.027498627081513405, + "kl": 0.001116037368774414, + "learning_rate": 9.873824502603459e-07, + "loss": 0.0056, + "reward": 0.10806799679994583, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1472853091545403, + "rewards/cosine_scaled_reward": 0.09858678560703993, + "rewards/format_reward": 0.43750001303851604, + "step": 84 + }, + { + "advantage_max": 0.2928034896031022, 
+ "advantage_mean": 8.537124340790569e-10, + "advantage_min": -0.21211090218275785, + "advantage_std": 0.20124134561046958, + "completion_length": 2936.0209045410156, + "epoch": 0.09714285714285714, + "grad_norm": 0.027292873710393906, + "kl": 0.0011175870895385742, + "learning_rate": 9.866330768241983e-07, + "loss": 0.011, + "reward": 0.09737453208072111, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.20124135678634048, + "rewards/cosine_scaled_reward": 0.017047576373443007, + "rewards/format_reward": 0.5416666772216558, + "step": 85 + }, + { + "advantage_max": 0.161500733345747, + "advantage_mean": 2.949188199208308e-09, + "advantage_min": -0.12576205004006624, + "advantage_std": 0.11738913925364614, + "completion_length": 3064.6042098999023, + "epoch": 0.09828571428571428, + "grad_norm": 0.019446710124611855, + "kl": 0.0017948150634765625, + "learning_rate": 9.85862422507884e-07, + "loss": 0.0081, + "reward": 0.030952767468988895, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11738914204761386, + "rewards/cosine_scaled_reward": -0.08462417311966419, + "rewards/format_reward": 0.3541666679084301, + "step": 86 + }, + { + "advantage_max": 0.1681693554855883, + "advantage_mean": -1.8626452047421083e-09, + "advantage_min": -0.1518132919445634, + "advantage_std": 0.1314193387515843, + "completion_length": 2709.166702270508, + "epoch": 0.09942857142857142, + "grad_norm": 0.027268240228295326, + "kl": 0.0031093358993530273, + "learning_rate": 9.850705248720068e-07, + "loss": 0.0161, + "reward": 0.03507482446730137, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1314193462021649, + "rewards/cosine_scaled_reward": -0.12755973311141133, + "rewards/format_reward": 0.45833334885537624, + "step": 87 + }, + { + "advantage_max": 0.27509157080203295, + "advantage_mean": -3.570070003511283e-09, + "advantage_min": -0.19592854473739862, + "advantage_std": 0.1789478063583374, + "completion_length": 2636.250045776367, + 
"epoch": 0.10057142857142858, + "grad_norm": 0.03065653145313263, + "kl": 0.0032744407653808594, + "learning_rate": 9.8425742251254e-07, + "loss": 0.0206, + "reward": 0.08256556163541973, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.17894781846553087, + "rewards/cosine_scaled_reward": 0.014251076150685549, + "rewards/format_reward": 0.4583333469927311, + "step": 88 + }, + { + "advantage_max": 0.16426204703748226, + "advantage_mean": -2.638747317873502e-09, + "advantage_min": -0.1488904170691967, + "advantage_std": 0.12655864632688463, + "completion_length": 3053.6041717529297, + "epoch": 0.10171428571428572, + "grad_norm": 0.02547341212630272, + "kl": 0.002205371856689453, + "learning_rate": 9.83423155058946e-07, + "loss": 0.0104, + "reward": 0.027745387284085155, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.126558649353683, + "rewards/cosine_scaled_reward": -0.09617769811302423, + "rewards/format_reward": 0.3541666716337204, + "step": 89 + }, + { + "advantage_max": 0.10391614772379398, + "advantage_mean": -1.3193736519978572e-09, + "advantage_min": -0.10397443547844887, + "advantage_std": 0.07875827606767416, + "completion_length": 2370.375015258789, + "epoch": 0.10285714285714286, + "grad_norm": 0.015140805393457413, + "kl": 0.004084110260009766, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0056, + "reward": -0.006435986841097474, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.0787582783959806, + "rewards/cosine_scaled_reward": -0.2495388761162758, + "rewards/format_reward": 0.45833334140479565, + "step": 90 + }, + { + "advantage_max": 0.1725978935137391, + "advantage_mean": 3.02679846464482e-09, + "advantage_min": -0.14624899346381426, + "advantage_std": 0.13377743028104305, + "completion_length": 3174.0416870117188, + "epoch": 0.104, + "grad_norm": 0.0243070051074028, + "kl": 0.0019826889038085938, + "learning_rate": 9.816912885430258e-07, + "loss": 0.0033, + "reward": 
0.021438519936054945, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13377743400633335, + "rewards/cosine_scaled_reward": -0.09193318895995617, + "rewards/format_reward": 0.31250000558793545, + "step": 91 + }, + { + "advantage_max": 0.14272566698491573, + "advantage_mean": -1.7850349948167477e-09, + "advantage_min": -0.11759363766759634, + "advantage_std": 0.10569121921434999, + "completion_length": 2625.6667098999023, + "epoch": 0.10514285714285715, + "grad_norm": 0.0243788193911314, + "kl": 0.0036519765853881836, + "learning_rate": 9.807937738894303e-07, + "loss": -0.0039, + "reward": 0.034875532728619874, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.10569121642038226, + "rewards/cosine_scaled_reward": -0.15802248800173402, + "rewards/format_reward": 0.5208333376795053, + "step": 92 + }, + { + "advantage_max": 0.10833407612517476, + "advantage_mean": 1.3193736728145389e-09, + "advantage_min": -0.09252861887216568, + "advantage_std": 0.07854902278631926, + "completion_length": 3483.5208740234375, + "epoch": 0.10628571428571429, + "grad_norm": 0.013983568176627159, + "kl": 0.0025484561920166016, + "learning_rate": 9.798752629550546e-07, + "loss": 0.0054, + "reward": -0.07466757856309414, + "reward_advantage_correlation": 1.0, + "reward_std": 0.07854902278631926, + "rewards/cosine_scaled_reward": -0.2516121231019497, + "rewards/format_reward": 0.06250000186264515, + "step": 93 + }, + { + "advantage_max": 0.16513420641422272, + "advantage_mean": 9.313226023710541e-10, + "advantage_min": -0.10861359536647797, + "advantage_std": 0.10586209408938885, + "completion_length": 3181.625030517578, + "epoch": 0.10742857142857143, + "grad_norm": 0.019842853769659996, + "kl": 0.0038785934448242188, + "learning_rate": 9.78935800506826e-07, + "loss": 0.0048, + "reward": 0.023791223531588912, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10586209432221949, + "rewards/cosine_scaled_reward": -0.043594514252617955, + 
"rewards/format_reward": 0.22916666977107525, + "step": 94 + }, + { + "advantage_max": 0.15752420481294394, + "advantage_mean": 1.3969838966176873e-09, + "advantage_min": -0.10875887889415026, + "advantage_std": 0.09435309539549053, + "completion_length": 3359.4583740234375, + "epoch": 0.10857142857142857, + "grad_norm": 0.012666534632444382, + "kl": 0.0016287565231323242, + "learning_rate": 9.779754323328192e-07, + "loss": 0.004, + "reward": -0.032025402411818504, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09435309586115181, + "rewards/cosine_scaled_reward": -0.18826766480924562, + "rewards/format_reward": 0.18750000186264515, + "step": 95 + }, + { + "advantage_max": 0.16466173576191068, + "advantage_mean": 5.122274243651859e-09, + "advantage_min": -0.14734835969284177, + "advantage_std": 0.11930368887260556, + "completion_length": 2774.3333740234375, + "epoch": 0.10971428571428571, + "grad_norm": 0.020811520516872406, + "kl": 0.004055976867675781, + "learning_rate": 9.769942052400235e-07, + "loss": 0.0012, + "reward": 0.07045774557627738, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11930369678884745, + "rewards/cosine_scaled_reward": -0.010209089145064354, + "rewards/format_reward": 0.43750000186264515, + "step": 96 + }, + { + "advantage_max": 0.20686533208936453, + "advantage_mean": 5.551115123125783e-17, + "advantage_min": -0.1880991030484438, + "advantage_std": 0.1569270808249712, + "completion_length": 3132.5834350585938, + "epoch": 0.11085714285714286, + "grad_norm": 0.023457398638129234, + "kl": 0.0023653507232666016, + "learning_rate": 9.759921670520634e-07, + "loss": 0.0108, + "reward": 0.06218179999268614, + "reward_advantage_correlation": 1.0, + "reward_std": 0.15692708734422922, + "rewards/cosine_scaled_reward": -0.03378633502870798, + "rewards/format_reward": 0.43750001303851604, + "step": 97 + }, + { + "advantage_max": 0.14833640353754163, + "advantage_mean": 3.1044086745701804e-10, + "advantage_min": -0.12552594719454646, 
+ "advantage_std": 0.10549607453867793, + "completion_length": 2822.7708587646484, + "epoch": 0.112, + "grad_norm": 0.019470063969492912, + "kl": 0.0015528202056884766, + "learning_rate": 9.749693666068663e-07, + "loss": 0.0103, + "reward": 0.03668228443711996, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10549607826396823, + "rewards/cosine_scaled_reward": -0.12178766075521708, + "rewards/format_reward": 0.45833334140479565, + "step": 98 + }, + { + "advantage_max": 0.11012635566294193, + "advantage_mean": 4.1909516412808046e-09, + "advantage_min": -0.11911307182163, + "advantage_std": 0.09590944508090615, + "completion_length": 2851.645835876465, + "epoch": 0.11314285714285714, + "grad_norm": 0.018691029399633408, + "kl": 0.00240325927734375, + "learning_rate": 9.739258537542835e-07, + "loss": -0.0003, + "reward": 0.009538065176457167, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09590945346280932, + "rewards/cosine_scaled_reward": -0.1280885050073266, + "rewards/format_reward": 0.31250000186264515, + "step": 99 + }, + { + "advantage_max": 0.13672667369246483, + "advantage_mean": -2.638747401140229e-09, + "advantage_min": -0.13193287048488855, + "advantage_std": 0.10591222485527396, + "completion_length": 2591.270866394043, + "epoch": 0.11428571428571428, + "grad_norm": 0.018864328041672707, + "kl": 0.0028095245361328125, + "learning_rate": 9.728616793536587e-07, + "loss": -0.0009, + "reward": 0.10221519600600004, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1059122271835804, + "rewards/cosine_scaled_reward": 0.042956192046403885, + "rewards/format_reward": 0.520833333954215, + "step": 100 + }, + { + "advantage_max": 0.093271154910326, + "advantage_mean": -2.7755575615628914e-17, + "advantage_min": -0.09792666789144278, + "advantage_std": 0.08075838536024094, + "completion_length": 2602.7916870117188, + "epoch": 0.11542857142857142, + "grad_norm": 0.016473442316055298, + "kl": 0.0017747879028320312, + 
"learning_rate": 9.717768952713511e-07, + "loss": 0.0086, + "reward": 0.05790668725967407, + "reward_advantage_correlation": 1.0, + "reward_std": 0.08075838536024094, + "rewards/cosine_scaled_reward": -0.05933393910527229, + "rewards/format_reward": 0.45833333395421505, + "step": 101 + }, + { + "advantage_max": 0.24052445031702518, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.19093013741075993, + "advantage_std": 0.16782204061746597, + "completion_length": 2524.7708892822266, + "epoch": 0.11657142857142858, + "grad_norm": 0.035958126187324524, + "kl": 0.004951953887939453, + "learning_rate": 9.706715543782064e-07, + "loss": 0.0201, + "reward": 0.056061833864077926, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1678220392204821, + "rewards/cosine_scaled_reward": -0.10558513924479485, + "rewards/format_reward": 0.5416666772216558, + "step": 102 + }, + { + "advantage_max": 0.185080180875957, + "advantage_mean": -3.414849569782774e-09, + "advantage_min": -0.14461032394319773, + "advantage_std": 0.13952101161703467, + "completion_length": 2854.5416984558105, + "epoch": 0.11771428571428572, + "grad_norm": 0.01984941028058529, + "kl": 0.0026073455810546875, + "learning_rate": 9.695457105469804e-07, + "loss": 0.012, + "reward": 0.025294456630945206, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.13952100928872824, + "rewards/cosine_scaled_reward": -0.11525619064923376, + "rewards/format_reward": 0.37500000558793545, + "step": 103 + }, + { + "advantage_max": 0.08083041338250041, + "advantage_mean": -2.716357590248908e-09, + "advantage_min": -0.10733875446021557, + "advantage_std": 0.07301557017490268, + "completion_length": 2621.395851135254, + "epoch": 0.11885714285714286, + "grad_norm": 0.11099490523338318, + "kl": 0.0036411285400390625, + "learning_rate": 9.683994186497132e-07, + "loss": 0.0006, + "reward": 0.03443576395511627, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 
0.07301557110622525, + "rewards/cosine_scaled_reward": -0.09871989954262972, + "rewards/format_reward": 0.39583333395421505, + "step": 104 + }, + { + "advantage_max": 0.2338110376149416, + "advantage_mean": -6.51925827210853e-09, + "advantage_min": -0.2026761043816805, + "advantage_std": 0.18882041098549962, + "completion_length": 2379.8958587646484, + "epoch": 0.12, + "grad_norm": 0.02698267251253128, + "kl": 0.0016281604766845703, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0194, + "reward": 0.11758742481470108, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.18882041005417705, + "rewards/cosine_scaled_reward": 0.06502276286482811, + "rewards/format_reward": 0.5625000074505806, + "step": 105 + }, + { + "advantage_max": 0.11389932315796614, + "advantage_mean": -4.811833403950416e-09, + "advantage_min": -0.15608873032033443, + "advantage_std": 0.11125539761269465, + "completion_length": 2167.5833702087402, + "epoch": 0.12114285714285715, + "grad_norm": 0.020373547449707985, + "kl": 0.0024791955947875977, + "learning_rate": 9.66045715125541e-07, + "loss": 0.0012, + "reward": 0.20540974102914333, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11125540168723091, + "rewards/cosine_scaled_reward": 0.22056686785072088, + "rewards/format_reward": 0.7708333432674408, + "step": 106 + }, + { + "advantage_max": 0.10643723327666521, + "advantage_mean": 3.8805109126016646e-10, + "advantage_min": -0.11985098151490092, + "advantage_std": 0.09828559448942542, + "completion_length": 2751.604179382324, + "epoch": 0.12228571428571429, + "grad_norm": 0.018458819016814232, + "kl": 0.0025892257690429688, + "learning_rate": 9.648384182148252e-07, + "loss": 0.0116, + "reward": 0.025896158069372177, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09828559728339314, + "rewards/cosine_scaled_reward": -0.16178596578538418, + "rewards/format_reward": 0.4791666753590107, + "step": 107 + }, + { + "advantage_max": 0.16523229470476508, + 
"advantage_mean": -1.2417635114614356e-09, + "advantage_min": -0.11955348215997219, + "advantage_std": 0.11437350790947676, + "completion_length": 2345.145866394043, + "epoch": 0.12342857142857143, + "grad_norm": 0.015816396102309227, + "kl": 0.0018943548202514648, + "learning_rate": 9.636109026648554e-07, + "loss": 0.0099, + "reward": 0.08748990359163145, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11437350790947676, + "rewards/cosine_scaled_reward": -0.044466909021139145, + "rewards/format_reward": 0.6041666772216558, + "step": 108 + }, + { + "advantage_max": 0.10261640883982182, + "advantage_mean": -3.0656037708354233e-09, + "advantage_min": -0.07673678267747164, + "advantage_std": 0.07046977989375591, + "completion_length": 2884.791702270508, + "epoch": 0.12457142857142857, + "grad_norm": 0.012454885058104992, + "kl": 0.0016399621963500977, + "learning_rate": 9.623632283030077e-07, + "loss": 0.0055, + "reward": 0.007819185324478894, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.07046978268772364, + "rewards/cosine_scaled_reward": -0.15225903131067753, + "rewards/format_reward": 0.3541666679084301, + "step": 109 + }, + { + "advantage_max": 0.19972685351967812, + "advantage_mean": -1.474594168993093e-09, + "advantage_min": -0.1611551959067583, + "advantage_std": 0.1318470723927021, + "completion_length": 2799.854263305664, + "epoch": 0.12571428571428572, + "grad_norm": 0.022104663774371147, + "kl": 0.0022640228271484375, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0006, + "reward": 0.06127751222811639, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.13184707635082304, + "rewards/cosine_scaled_reward": -0.07033403514651582, + "rewards/format_reward": 0.5000000074505806, + "step": 110 + }, + { + "advantage_max": 0.1784421824850142, + "advantage_mean": 8.537123924456935e-10, + "advantage_min": -0.1500476342625916, + "advantage_std": 0.1379630877636373, + "completion_length": 
3026.3334045410156, + "epoch": 0.12685714285714286, + "grad_norm": 0.023240555077791214, + "kl": 0.004309654235839844, + "learning_rate": 9.598076473627796e-07, + "loss": 0.0047, + "reward": 0.0340226587431971, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13796309987083077, + "rewards/cosine_scaled_reward": -0.08653214666992426, + "rewards/format_reward": 0.375, + "step": 111 + }, + { + "advantage_max": 0.2080360697582364, + "advantage_mean": 2.5611371773370806e-09, + "advantage_min": -0.1433120183646679, + "advantage_std": 0.1446642866358161, + "completion_length": 2837.750030517578, + "epoch": 0.128, + "grad_norm": 0.019071072340011597, + "kl": 0.0014045238494873047, + "learning_rate": 9.58499865339809e-07, + "loss": 0.0012, + "reward": 0.08503166912123561, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1446642787195742, + "rewards/cosine_scaled_reward": 0.011173800681717694, + "rewards/format_reward": 0.47916666977107525, + "step": 112 + }, + { + "advantage_max": 0.19163453578948975, + "advantage_mean": -2.1730860721991263e-09, + "advantage_min": -0.1518814405426383, + "advantage_std": 0.14186442783102393, + "completion_length": 2613.0625762939453, + "epoch": 0.12914285714285714, + "grad_norm": 0.0337207056581974, + "kl": 0.0030469894409179688, + "learning_rate": 9.571721736097088e-07, + "loss": 0.0203, + "reward": 0.04671849589794874, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.14186443528160453, + "rewards/cosine_scaled_reward": -0.1140240803360939, + "rewards/format_reward": 0.5000000111758709, + "step": 113 + }, + { + "advantage_max": 0.13480149395763874, + "advantage_mean": -3.4924596686858322e-09, + "advantage_min": -0.09827340161427855, + "advantage_std": 0.08871775027364492, + "completion_length": 2424.5208892822266, + "epoch": 0.13028571428571428, + "grad_norm": 0.013029958121478558, + "kl": 0.0030794143676757812, + "learning_rate": 9.55824636882301e-07, + "loss": 0.0055, + "reward": 
0.034493221901357174, + "reward_advantage_correlation": 1.0, + "reward_std": 0.08871775213629007, + "rewards/cosine_scaled_reward": -0.22131562419235706, + "rewards/format_reward": 0.6458333395421505, + "step": 114 + }, + { + "advantage_max": 0.10426800954155624, + "advantage_mean": 1.9014502611325312e-09, + "advantage_min": -0.1240433705970645, + "advantage_std": 0.08707842836156487, + "completion_length": 2850.9583587646484, + "epoch": 0.13142857142857142, + "grad_norm": 0.011764142662286758, + "kl": 0.003220081329345703, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0007, + "reward": 0.02795394801069051, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.08707842929288745, + "rewards/cosine_scaled_reward": -0.08467382844537497, + "rewards/format_reward": 0.3333333358168602, + "step": 115 + }, + { + "advantage_max": 0.1957715330645442, + "advantage_mean": -7.761021686425451e-10, + "advantage_min": -0.10577770043164492, + "advantage_std": 0.11993355210870504, + "completion_length": 3311.604202270508, + "epoch": 0.13257142857142856, + "grad_norm": 0.02515524998307228, + "kl": 0.0029854774475097656, + "learning_rate": 9.530702921077358e-07, + "loss": -0.0008, + "reward": -0.030864793108776212, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11993355257436633, + "rewards/cosine_scaled_reward": -0.17528742615832016, + "rewards/format_reward": 0.16666667349636555, + "step": 116 + }, + { + "advantage_max": 0.18044267036020756, + "advantage_mean": 3.88051125954636e-10, + "advantage_min": -0.11745557747781277, + "advantage_std": 0.10928188590332866, + "completion_length": 2841.6041717529297, + "epoch": 0.1337142857142857, + "grad_norm": 0.019817935302853584, + "kl": 0.0042095184326171875, + "learning_rate": 9.516636183034564e-07, + "loss": 0.003, + "reward": 0.01119667274178937, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10928189102560282, + "rewards/cosine_scaled_reward": -0.17523309215903282, + 
"rewards/format_reward": 0.416666679084301, + "step": 117 + }, + { + "advantage_max": 0.21986545948311687, + "advantage_mean": -4.268561920595104e-09, + "advantage_min": -0.14438048377633095, + "advantage_std": 0.14283119468018413, + "completion_length": 2871.166732788086, + "epoch": 0.13485714285714287, + "grad_norm": 0.024016261100769043, + "kl": 0.0021152496337890625, + "learning_rate": 9.502373679810839e-07, + "loss": 0.0001, + "reward": 0.15214395127259195, + "reward_advantage_correlation": 1.0, + "reward_std": 0.142831196077168, + "rewards/cosine_scaled_reward": 0.16847316874191165, + "rewards/format_reward": 0.5625000037252903, + "step": 118 + }, + { + "advantage_max": 0.15783230029046535, + "advantage_mean": -5.1222742575296465e-09, + "advantage_min": -0.14731642603874207, + "advantage_std": 0.12371453363448381, + "completion_length": 2522.604202270508, + "epoch": 0.136, + "grad_norm": 0.01609306037425995, + "kl": 0.004221200942993164, + "learning_rate": 9.487916106540465e-07, + "loss": 0.0129, + "reward": 0.09867640398442745, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12371453177183867, + "rewards/cosine_scaled_reward": -0.0007585976272821426, + "rewards/format_reward": 0.5833333395421505, + "step": 119 + }, + { + "advantage_max": 0.20129030477255583, + "advantage_mean": -8.537124063234813e-10, + "advantage_min": -0.12198320962488651, + "advantage_std": 0.12838405929505825, + "completion_length": 2559.8958740234375, + "epoch": 0.13714285714285715, + "grad_norm": 0.04911419749259949, + "kl": 0.004084587097167969, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0137, + "reward": 0.06981711252592504, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12838406348600984, + "rewards/cosine_scaled_reward": -0.044595762854442, + "rewards/format_reward": 0.5000000074505806, + "step": 120 + }, + { + "advantage_max": 0.1426694355905056, + "advantage_mean": -6.907309335613121e-09, + "advantage_min": -0.16301002446562052, + "advantage_std": 
0.12418079562485218, + "completion_length": 1872.1042251586914, + "epoch": 0.1382857142857143, + "grad_norm": 0.040391478687524796, + "kl": 0.0055408477783203125, + "learning_rate": 9.458418577899774e-07, + "loss": 0.0151, + "reward": 0.12901237746700644, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12418079702183604, + "rewards/cosine_scaled_reward": 0.0047687627375125885, + "rewards/format_reward": 0.7500000074505806, + "step": 121 + }, + { + "advantage_max": 0.17488223453983665, + "advantage_mean": -4.113341472988807e-09, + "advantage_min": -0.20183194149285555, + "advantage_std": 0.1446224031969905, + "completion_length": 2870.270896911621, + "epoch": 0.13942857142857143, + "grad_norm": 0.043231695890426636, + "kl": 0.0026760101318359375, + "learning_rate": 9.443380060197385e-07, + "loss": 0.0071, + "reward": 0.07613519253209233, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14462240180000663, + "rewards/cosine_scaled_reward": 0.00641607865691185, + "rewards/format_reward": 0.4375000111758709, + "step": 122 + }, + { + "advantage_max": 0.1747486158274114, + "advantage_mean": 8.537123646901179e-10, + "advantage_min": -0.15847238339483738, + "advantage_std": 0.13815004844218493, + "completion_length": 2630.8959197998047, + "epoch": 0.14057142857142857, + "grad_norm": 0.026115527376532555, + "kl": 0.002586841583251953, + "learning_rate": 9.428149347714143e-07, + "loss": 0.0165, + "reward": 0.04064048221334815, + "reward_advantage_correlation": 0.9999999999999997, + "reward_std": 0.13815005496144295, + "rewards/cosine_scaled_reward": -0.15376391634345055, + "rewards/format_reward": 0.5416666828095913, + "step": 123 + }, + { + "advantage_max": 0.2104870369657874, + "advantage_mean": -1.4745941412375174e-09, + "advantage_min": -0.16038642171770334, + "advantage_std": 0.14703476894646883, + "completion_length": 2258.8125228881836, + "epoch": 0.1417142857142857, + "grad_norm": 0.023247675970196724, + "kl": 0.005304813385009766, + "learning_rate": 
9.412727182773486e-07, + "loss": 0.0087, + "reward": 0.061627675080671906, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14703477453440428, + "rewards/cosine_scaled_reward": -0.1097550387494266, + "rewards/format_reward": 0.5833333414047956, + "step": 124 + }, + { + "advantage_max": 0.16207702737301588, + "advantage_mean": -1.7074247571358114e-09, + "advantage_min": -0.12357809208333492, + "advantage_std": 0.11802704073488712, + "completion_length": 2823.6875228881836, + "epoch": 0.14285714285714285, + "grad_norm": 0.01639670506119728, + "kl": 0.0029687881469726562, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0048, + "reward": 0.0728115017991513, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11802704306319356, + "rewards/cosine_scaled_reward": 0.035107508301734924, + "rewards/format_reward": 0.3541666679084301, + "step": 125 + }, + { + "advantage_max": 0.13201149785891175, + "advantage_mean": -2.9491882547194592e-09, + "advantage_min": -0.13687030225992203, + "advantage_std": 0.1064839765895158, + "completion_length": 2888.375030517578, + "epoch": 0.144, + "grad_norm": 0.021405896171927452, + "kl": 0.002410888671875, + "learning_rate": 9.381311511432658e-07, + "loss": 0.0082, + "reward": 0.029038145439699292, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10648397775366902, + "rewards/cosine_scaled_reward": -0.09103115275502205, + "rewards/format_reward": 0.35416668094694614, + "step": 126 + }, + { + "advantage_max": 0.18761903140693903, + "advantage_mean": 1.0865429944661997e-09, + "advantage_min": -0.09865566249936819, + "advantage_std": 0.11243651760742068, + "completion_length": 3327.7084045410156, + "epoch": 0.14514285714285713, + "grad_norm": 0.01991112157702446, + "kl": 0.005802154541015625, + "learning_rate": 9.36531953618799e-07, + "loss": 0.0053, + "reward": -0.03913277422543615, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11243652272969484, + 
"rewards/cosine_scaled_reward": -0.21913862321525812, + "rewards/format_reward": 0.20833334140479565, + "step": 127 + }, + { + "advantage_max": 0.14964813645929098, + "advantage_mean": -5.743156020199258e-09, + "advantage_min": -0.2074049087241292, + "advantage_std": 0.1491677723824978, + "completion_length": 2978.166732788086, + "epoch": 0.1462857142857143, + "grad_norm": 0.02437690831720829, + "kl": 0.004235744476318359, + "learning_rate": 9.34913917072228e-07, + "loss": 0.0069, + "reward": 0.08901105728000402, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14916778123006225, + "rewards/cosine_scaled_reward": 0.07575214840471745, + "rewards/format_reward": 0.3750000074505806, + "step": 128 + }, + { + "advantage_max": 0.16403791401535273, + "advantage_mean": 6.208817210362483e-10, + "advantage_min": -0.1340857520699501, + "advantage_std": 0.12814921559765935, + "completion_length": 3485.479217529297, + "epoch": 0.14742857142857144, + "grad_norm": 0.024071309715509415, + "kl": 0.004917621612548828, + "learning_rate": 9.332771203643714e-07, + "loss": 0.0058, + "reward": -0.0004819065798074007, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12814922630786896, + "rewards/cosine_scaled_reward": -0.09490467235445976, + "rewards/format_reward": 0.18750000186264515, + "step": 129 + }, + { + "advantage_max": 0.1427209312096238, + "advantage_mean": 3.104408632936817e-09, + "advantage_min": -0.08900717180222273, + "advantage_std": 0.08698507398366928, + "completion_length": 3171.6666717529297, + "epoch": 0.14857142857142858, + "grad_norm": 0.014160319231450558, + "kl": 0.0046808719635009766, + "learning_rate": 9.316216432703916e-07, + "loss": 0.002, + "reward": -0.03297149168793112, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.08698507212102413, + "rewards/cosine_scaled_reward": -0.19968184363096952, + "rewards/format_reward": 0.2083333358168602, + "step": 130 + }, + { + "advantage_max": 0.13422834686934948, + 
"advantage_mean": -1.1641533154138628e-09, + "advantage_min": -0.18032845202833414, + "advantage_std": 0.12928894069045782, + "completion_length": 2918.6875610351562, + "epoch": 0.14971428571428572, + "grad_norm": 0.0223796758800745, + "kl": 0.0055751800537109375, + "learning_rate": 9.299475664759068e-07, + "loss": 0.0089, + "reward": 0.07486723270267248, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12928894069045782, + "rewards/cosine_scaled_reward": 0.023864680901169777, + "rewards/format_reward": 0.39583334513008595, + "step": 131 + }, + { + "advantage_max": 0.16987570468336344, + "advantage_mean": -3.88051125954636e-10, + "advantage_min": -0.17349545564502478, + "advantage_std": 0.1324772317893803, + "completion_length": 2984.437515258789, + "epoch": 0.15085714285714286, + "grad_norm": 0.019351573660969734, + "kl": 0.004413604736328125, + "learning_rate": 9.282549715730579e-07, + "loss": 0.0084, + "reward": 0.05797156970947981, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13247722946107388, + "rewards/cosine_scaled_reward": -0.00954313576221466, + "rewards/format_reward": 0.35416667722165585, + "step": 132 + }, + { + "advantage_max": 0.1357247936539352, + "advantage_mean": 1.862645218619896e-09, + "advantage_min": -0.08459451515227556, + "advantage_std": 0.08351372461766005, + "completion_length": 3332.4583740234375, + "epoch": 0.152, + "grad_norm": 0.014467400498688221, + "kl": 0.005523681640625, + "learning_rate": 9.265439410565328e-07, + "loss": 0.0031, + "reward": -0.05285291757900268, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.08351372368633747, + "rewards/cosine_scaled_reward": -0.2597227357327938, + "rewards/format_reward": 0.20833333395421505, + "step": 133 + }, + { + "advantage_max": 0.2571664294227958, + "advantage_mean": -5.898376592705645e-09, + "advantage_min": -0.15844144020229578, + "advantage_std": 0.16976315109059215, + "completion_length": 2875.3750610351562, + "epoch": 
0.15314285714285714, + "grad_norm": 0.028712719678878784, + "kl": 0.007971763610839844, + "learning_rate": 9.248145583195447e-07, + "loss": 0.0054, + "reward": 0.10160159273073077, + "reward_advantage_correlation": 1.0, + "reward_std": 0.16976316180080175, + "rewards/cosine_scaled_reward": 0.0412679030559957, + "rewards/format_reward": 0.5208333432674408, + "step": 134 + }, + { + "advantage_max": 0.17440959997475147, + "advantage_mean": -1.3969840007010959e-09, + "advantage_min": -0.175615637563169, + "advantage_std": 0.14522844285238534, + "completion_length": 2164.6458740234375, + "epoch": 0.15428571428571428, + "grad_norm": 0.026748182252049446, + "kl": 0.004391670227050781, + "learning_rate": 9.230669076497687e-07, + "loss": -0.0007, + "reward": 0.1501585068181157, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14522844285238534, + "rewards/cosine_scaled_reward": 0.1208992125466466, + "rewards/format_reward": 0.6458333432674408, + "step": 135 + }, + { + "advantage_max": 0.18647493747994304, + "advantage_mean": -3.570070156166949e-09, + "advantage_min": -0.23410069476813078, + "advantage_std": 0.17950192606076598, + "completion_length": 3038.9583435058594, + "epoch": 0.15542857142857142, + "grad_norm": 0.0378388985991478, + "kl": 0.006519317626953125, + "learning_rate": 9.213010742252327e-07, + "loss": 0.0237, + "reward": 0.0896531674079597, + "reward_advantage_correlation": 1.0, + "reward_std": 0.179501932580024, + "rewards/cosine_scaled_reward": 0.07765412889420986, + "rewards/format_reward": 0.3750000111758709, + "step": 136 + }, + { + "advantage_max": 0.15427146770525724, + "advantage_mean": 2.1730860513824446e-09, + "advantage_min": -0.13093871576711535, + "advantage_std": 0.11751817003823817, + "completion_length": 3286.7708587646484, + "epoch": 0.15657142857142858, + "grad_norm": 0.02052266336977482, + "kl": 0.005681037902832031, + "learning_rate": 9.195171441101668e-07, + "loss": 0.0092, + "reward": -0.00857232604175806, + 
"reward_advantage_correlation": 1.0, + "reward_std": 0.11751817702315748, + "rewards/cosine_scaled_reward": -0.1289244736544788, + "rewards/format_reward": 0.20833333767950535, + "step": 137 + }, + { + "advantage_max": 0.1545063005760312, + "advantage_mean": 4.190951738425319e-09, + "advantage_min": -0.09753599436953664, + "advantage_std": 0.10371883399784565, + "completion_length": 2524.375045776367, + "epoch": 0.15771428571428572, + "grad_norm": 0.013675352558493614, + "kl": 0.004207611083984375, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0049, + "reward": 0.07395011962216813, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1037188358604908, + "rewards/cosine_scaled_reward": -0.064213783480227, + "rewards/format_reward": 0.562500013038516, + "step": 138 + }, + { + "advantage_max": 0.14760101400315762, + "advantage_mean": -1.241763476766966e-09, + "advantage_min": -0.16335713909938931, + "advantage_std": 0.13432236923836172, + "completion_length": 3093.229232788086, + "epoch": 0.15885714285714286, + "grad_norm": 0.026457417756319046, + "kl": 0.005748748779296875, + "learning_rate": 9.158953424711624e-07, + "loss": 0.009, + "reward": 0.0407895278185606, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13432237412780523, + "rewards/cosine_scaled_reward": -0.06742438767105341, + "rewards/format_reward": 0.3750000149011612, + "step": 139 + }, + { + "advantage_max": 0.09711257927119732, + "advantage_mean": -6.984918754504577e-10, + "advantage_min": -0.08666328061372042, + "advantage_std": 0.07689650449901819, + "completion_length": 3201.7083740234375, + "epoch": 0.16, + "grad_norm": 0.021007120609283447, + "kl": 0.010286331176757812, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0091, + "reward": -0.013685423880815506, + "reward_advantage_correlation": 1.0, + "reward_std": 0.07689650217071176, + "rewards/cosine_scaled_reward": -0.1452522613108158, + "rewards/format_reward": 0.20833334140479565, + "step": 140 + }, + { + 
"advantage_max": 0.2189687853679061, + "advantage_mean": -2.0954758206404023e-09, + "advantage_min": -0.13264015363529325, + "advantage_std": 0.14607419818639755, + "completion_length": 2922.229248046875, + "epoch": 0.16114285714285714, + "grad_norm": 0.02560114860534668, + "kl": 0.0079193115234375, + "learning_rate": 9.122022088101613e-07, + "loss": 0.0077, + "reward": 0.030265355249866843, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14607420284301043, + "rewards/cosine_scaled_reward": -0.1499252589419484, + "rewards/format_reward": 0.47916667349636555, + "step": 141 + }, + { + "advantage_max": 0.18417442869395018, + "advantage_mean": -3.570070017389071e-09, + "advantage_min": -0.16223066858947277, + "advantage_std": 0.1339187677949667, + "completion_length": 3167.104248046875, + "epoch": 0.16228571428571428, + "grad_norm": 0.020130103453993797, + "kl": 0.0071773529052734375, + "learning_rate": 9.103291169269299e-07, + "loss": 0.0164, + "reward": 0.06062847562134266, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13391877012327313, + "rewards/cosine_scaled_reward": -0.03922642639372498, + "rewards/format_reward": 0.43750001676380634, + "step": 142 + }, + { + "advantage_max": 0.12369767762720585, + "advantage_mean": -2.7163574167765603e-10, + "advantage_min": -0.10968521423637867, + "advantage_std": 0.09598812274634838, + "completion_length": 2993.0833435058594, + "epoch": 0.16342857142857142, + "grad_norm": 0.02057984471321106, + "kl": 0.007022857666015625, + "learning_rate": 9.084384631108882e-07, + "loss": 0.0107, + "reward": -0.01793865323998034, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09598812321200967, + "rewards/cosine_scaled_reward": -0.2098203683272004, + "rewards/format_reward": 0.31250000558793545, + "step": 143 + }, + { + "advantage_max": 0.24469326250255108, + "advantage_mean": 1.8626451353531692e-09, + "advantage_min": -0.16228821780532598, + "advantage_std": 0.1688228864222765, + "completion_length": 
3053.1458740234375, + "epoch": 0.16457142857142856, + "grad_norm": 0.028377985581755638, + "kl": 0.006343841552734375, + "learning_rate": 9.065303395098358e-07, + "loss": 0.0125, + "reward": 0.02497765606676694, + "reward_advantage_correlation": 1.0, + "reward_std": 0.16882288735359907, + "rewards/cosine_scaled_reward": -0.08332122955471277, + "rewards/format_reward": 0.31250000558793545, + "step": 144 + }, + { + "advantage_max": 0.11938331183046103, + "advantage_mean": -3.182019189806873e-09, + "advantage_min": -0.10436953045427799, + "advantage_std": 0.0860091031063348, + "completion_length": 2103.9375534057617, + "epoch": 0.1657142857142857, + "grad_norm": 0.015253880061209202, + "kl": 0.004912376403808594, + "learning_rate": 9.046048391230247e-07, + "loss": 0.0093, + "reward": 0.13016253290697932, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.08600910496897995, + "rewards/cosine_scaled_reward": 0.08105293428525329, + "rewards/format_reward": 0.6041666772216558, + "step": 145 + }, + { + "advantage_max": 0.19676159508526325, + "advantage_mean": -2.0954758622737657e-09, + "advantage_min": -0.14002857124432921, + "advantage_std": 0.13696159655228257, + "completion_length": 2605.979202270508, + "epoch": 0.16685714285714287, + "grad_norm": 0.027135798707604408, + "kl": 0.004586696624755859, + "learning_rate": 9.026620557966279e-07, + "loss": 0.0149, + "reward": 0.050586492056027055, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13696160027757287, + "rewards/cosine_scaled_reward": -0.14402856817469, + "rewards/format_reward": 0.5833333395421505, + "step": 146 + }, + { + "advantage_max": 0.16726404940709472, + "advantage_mean": -4.5790028366243796e-09, + "advantage_min": -0.183818063698709, + "advantage_std": 0.1481722998432815, + "completion_length": 3017.0416870117188, + "epoch": 0.168, + "grad_norm": 0.032472483813762665, + "kl": 0.0078277587890625, + "learning_rate": 9.007020842191634e-07, + "loss": 0.015, + "reward": 
0.05535583617165685, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14817229937762022, + "rewards/cosine_scaled_reward": -0.0040863314643502235, + "rewards/format_reward": 0.3333333432674408, + "step": 147 + }, + { + "advantage_max": 0.1869575590826571, + "advantage_mean": -1.5522044760629683e-10, + "advantage_min": -0.16520546469837427, + "advantage_std": 0.1403520731255412, + "completion_length": 2839.854202270508, + "epoch": 0.16914285714285715, + "grad_norm": 0.0203824695199728, + "kl": 0.007618904113769531, + "learning_rate": 8.987250199168808e-07, + "loss": 0.0064, + "reward": 0.06426224764436483, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14035206753760576, + "rewards/cosine_scaled_reward": -0.050383010879158974, + "rewards/format_reward": 0.47916667349636555, + "step": 148 + }, + { + "advantage_max": 0.14235155889764428, + "advantage_mean": -9.119200672369487e-10, + "advantage_min": -0.17431307956576347, + "advantage_std": 0.12154442211613059, + "completion_length": 2893.604217529297, + "epoch": 0.1702857142857143, + "grad_norm": 0.017824489623308182, + "kl": 0.004513263702392578, + "learning_rate": 8.967309592491052e-07, + "loss": 0.0019, + "reward": 0.08055121125653386, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12154442025348544, + "rewards/cosine_scaled_reward": -0.03182307630777359, + "rewards/format_reward": 0.5416666734963655, + "step": 149 + }, + { + "advantage_max": 0.19116142578423023, + "advantage_mean": -1.9402553730341054e-09, + "advantage_min": -0.15318272728472948, + "advantage_std": 0.14041123539209366, + "completion_length": 2982.8958740234375, + "epoch": 0.17142857142857143, + "grad_norm": 0.024410845711827278, + "kl": 0.008457183837890625, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0147, + "reward": 0.037670310121029615, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1404112372547388, + "rewards/cosine_scaled_reward": -0.0773946214467287, + "rewards/format_reward": 
0.3750000037252903, + "step": 150 + }, + { + "advantage_max": 0.13646899722516537, + "advantage_mean": -3.6476802203755376e-09, + "advantage_min": -0.177975757047534, + "advantage_std": 0.12031815620139241, + "completion_length": 2622.729217529297, + "epoch": 0.17257142857142857, + "grad_norm": 0.03814442828297615, + "kl": 0.007709503173828125, + "learning_rate": 8.926922383915315e-07, + "loss": 0.0096, + "reward": 0.09881546010728925, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.120318160392344, + "rewards/cosine_scaled_reward": 0.010256066918373108, + "rewards/format_reward": 0.5625000149011612, + "step": 151 + }, + { + "advantage_max": 0.17624704539775848, + "advantage_mean": 3.8805107738237865e-10, + "advantage_min": -0.11608144547790289, + "advantage_std": 0.1141918571665883, + "completion_length": 2824.2291984558105, + "epoch": 0.1737142857142857, + "grad_norm": 0.022418169304728508, + "kl": 0.006558418273925781, + "learning_rate": 8.906477750432903e-07, + "loss": 0.0045, + "reward": 0.01563113136216998, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11419186135753989, + "rewards/cosine_scaled_reward": -0.15195966488681734, + "rewards/format_reward": 0.39583333395421505, + "step": 152 + }, + { + "advantage_max": 0.1550552500411868, + "advantage_mean": 1.7850349809389598e-09, + "advantage_min": -0.16724903974682093, + "advantage_std": 0.11843094322830439, + "completion_length": 3011.7916870117188, + "epoch": 0.17485714285714285, + "grad_norm": 0.02170824445784092, + "kl": 0.012271881103515625, + "learning_rate": 8.88586709003076e-07, + "loss": 0.0105, + "reward": 0.0336550869178609, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11843094741925597, + "rewards/cosine_scaled_reward": -0.0872773714363575, + "rewards/format_reward": 0.37500001303851604, + "step": 153 + }, + { + "advantage_max": 0.2223479701206088, + "advantage_mean": 3.1044086745701804e-09, + "advantage_min": 
-0.1979465465992689, + "advantage_std": 0.171802272554487, + "completion_length": 3407.437530517578, + "epoch": 0.176, + "grad_norm": 0.02847815677523613, + "kl": 0.005352020263671875, + "learning_rate": 8.865091407243394e-07, + "loss": 0.0077, + "reward": 0.047635506991355214, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.17180227162316442, + "rewards/cosine_scaled_reward": -0.01420294726267457, + "rewards/format_reward": 0.3125000074505806, + "step": 154 + }, + { + "advantage_max": 0.15556193236261606, + "advantage_mean": -3.880511745268933e-10, + "advantage_min": -0.20040578301995993, + "advantage_std": 0.1497023869305849, + "completion_length": 2653.562545776367, + "epoch": 0.17714285714285713, + "grad_norm": 0.03856699913740158, + "kl": 0.005893707275390625, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0166, + "reward": 0.08896010369062424, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.14970239624381065, + "rewards/cosine_scaled_reward": 0.03297489322721958, + "rewards/format_reward": 0.45833334513008595, + "step": 155 + }, + { + "advantage_max": 0.18007311457768083, + "advantage_mean": -1.241763414316921e-09, + "advantage_min": -0.14902439527213573, + "advantage_std": 0.13531371485441923, + "completion_length": 3162.687515258789, + "epoch": 0.1782857142857143, + "grad_norm": 0.017985651269555092, + "kl": 0.0062007904052734375, + "learning_rate": 8.823049032816478e-07, + "loss": 0.0054, + "reward": 0.019509871723130345, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13531372044235468, + "rewards/cosine_scaled_reward": -0.11034888960421085, + "rewards/format_reward": 0.3333333469927311, + "step": 156 + }, + { + "advantage_max": 0.13918431987985969, + "advantage_mean": -4.163336342344337e-17, + "advantage_min": -0.11121347360312939, + "advantage_std": 0.10173497628420591, + "completion_length": 3224.166717529297, + "epoch": 0.17942857142857144, + "grad_norm": 
0.01812376268208027, + "kl": 0.00917816162109375, + "learning_rate": 8.801784390262943e-07, + "loss": 0.01, + "reward": -0.0007896313909441233, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10173497628420591, + "rewards/cosine_scaled_reward": -0.13752574939280748, + "rewards/format_reward": 0.2708333432674408, + "step": 157 + }, + { + "advantage_max": 0.257691353559494, + "advantage_mean": -4.035731332452386e-09, + "advantage_min": -0.17544407676905394, + "advantage_std": 0.17222883319482207, + "completion_length": 3024.125030517578, + "epoch": 0.18057142857142858, + "grad_norm": 0.027070507407188416, + "kl": 0.0061359405517578125, + "learning_rate": 8.780358823396352e-07, + "loss": 0.0161, + "reward": 0.16088842856697738, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.17222883319482207, + "rewards/cosine_scaled_reward": 0.20306535623967648, + "rewards/format_reward": 0.541666679084301, + "step": 158 + }, + { + "advantage_max": 0.1392697487026453, + "advantage_mean": -1.3193737005701145e-09, + "advantage_min": -0.10665746498852968, + "advantage_std": 0.10162637522444129, + "completion_length": 3238.625030517578, + "epoch": 0.18171428571428572, + "grad_norm": 0.019616369158029556, + "kl": 0.009876251220703125, + "learning_rate": 8.758773376468604e-07, + "loss": 0.0023, + "reward": -0.024257861077785492, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.10162637708708644, + "rewards/cosine_scaled_reward": -0.20666235126554966, + "rewards/format_reward": 0.27083334140479565, + "step": 159 + }, + { + "advantage_max": 0.16948407562449574, + "advantage_mean": -3.5700700312668587e-09, + "advantage_min": -0.13234250340610743, + "advantage_std": 0.12069881893694401, + "completion_length": 2822.208381652832, + "epoch": 0.18285714285714286, + "grad_norm": 0.022409770637750626, + "kl": 0.011167526245117188, + "learning_rate": 8.737029101523929e-07, + "loss": 0.0033, + "reward": 0.04982085805386305, + 
"reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12069882079958916, + "rewards/cosine_scaled_reward": -0.04145237850025296, + "rewards/format_reward": 0.3750000111758709, + "step": 160 + }, + { + "advantage_max": 0.26078341249376535, + "advantage_mean": -4.190951766180895e-09, + "advantage_min": -0.20996790193021297, + "advantage_std": 0.18169856257736683, + "completion_length": 2801.5000610351562, + "epoch": 0.184, + "grad_norm": 0.03609248995780945, + "kl": 0.009645462036132812, + "learning_rate": 8.715127058347614e-07, + "loss": 0.0183, + "reward": 0.07784991223888937, + "reward_advantage_correlation": 1.0, + "reward_std": 0.18169856630265713, + "rewards/cosine_scaled_reward": -0.011576765216886997, + "rewards/format_reward": 0.47916668094694614, + "step": 161 + }, + { + "advantage_max": 0.14908618130721152, + "advantage_mean": -1.5522033658399437e-10, + "advantage_min": -0.13768692780286074, + "advantage_std": 0.12910312414169312, + "completion_length": 3373.8958435058594, + "epoch": 0.18514285714285714, + "grad_norm": 0.02396521158516407, + "kl": 0.012908935546875, + "learning_rate": 8.693068314414344e-07, + "loss": 0.0078, + "reward": -0.01704839337617159, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12910313392058015, + "rewards/cosine_scaled_reward": -0.1342112785205245, + "rewards/format_reward": 0.16666666977107525, + "step": 162 + }, + { + "advantage_max": 0.1578800524584949, + "advantage_mean": -3.4148495420271985e-09, + "advantage_min": -0.1815645396709442, + "advantage_std": 0.13664002949371934, + "completion_length": 2519.604232788086, + "epoch": 0.18628571428571428, + "grad_norm": 0.025771912187337875, + "kl": 0.009222030639648438, + "learning_rate": 8.670853944836176e-07, + "loss": 0.0139, + "reward": 0.15985322836786509, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13664002902805805, + "rewards/cosine_scaled_reward": 0.1906539173796773, + "rewards/format_reward": 0.562500013038516, + 
"step": 163 + }, + { + "advantage_max": 0.1497408151626587, + "advantage_mean": 3.8805109126016646e-10, + "advantage_min": -0.13764986861497164, + "advantage_std": 0.11453935131430626, + "completion_length": 2729.666732788086, + "epoch": 0.18742857142857142, + "grad_norm": 0.027371998876333237, + "kl": 0.008401870727539062, + "learning_rate": 8.648485032310144e-07, + "loss": 0.006, + "reward": 0.08568728528916836, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11453934758901596, + "rewards/cosine_scaled_reward": 0.013143057469278574, + "rewards/format_reward": 0.4791666679084301, + "step": 164 + }, + { + "advantage_max": 0.2020930303260684, + "advantage_mean": 4.113341500744383e-09, + "advantage_min": -0.13421366550028324, + "advantage_std": 0.1295091542415321, + "completion_length": 3323.375030517578, + "epoch": 0.18857142857142858, + "grad_norm": 0.02407933585345745, + "kl": 0.013275146484375, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0101, + "reward": 0.006851513287983835, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1295091570354998, + "rewards/cosine_scaled_reward": -0.1068816565675661, + "rewards/format_reward": 0.25000000186264515, + "step": 165 + }, + { + "advantage_max": 0.21237648325040936, + "advantage_mean": -1.629814540271557e-09, + "advantage_min": -0.15448179375380278, + "advantage_std": 0.1506841192021966, + "completion_length": 3147.666702270508, + "epoch": 0.18971428571428572, + "grad_norm": 0.02479720674455166, + "kl": 0.0067691802978515625, + "learning_rate": 8.603287946810513e-07, + "loss": 0.0051, + "reward": 0.013205445604398847, + "reward_advantage_correlation": 1.0, + "reward_std": 0.15068412572145462, + "rewards/cosine_scaled_reward": -0.1286666316445917, + "rewards/format_reward": 0.33333334140479565, + "step": 166 + }, + { + "advantage_max": 0.1873478158377111, + "advantage_mean": 4.6566125955216364e-10, + "advantage_min": -0.14719886984676123, + "advantage_std": 
0.12714364705607295, + "completion_length": 2632.7083892822266, + "epoch": 0.19085714285714286, + "grad_norm": 0.019351521506905556, + "kl": 0.006518363952636719, + "learning_rate": 8.580461976679099e-07, + "loss": 0.0108, + "reward": 0.05125786177814007, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12714365031570196, + "rewards/cosine_scaled_reward": -0.13082036562263966, + "rewards/format_reward": 0.5625000111758709, + "step": 167 + }, + { + "advantage_max": 0.17462534084916115, + "advantage_mean": -4.346172061131526e-09, + "advantage_min": -0.1636114427819848, + "advantage_std": 0.1395610896870494, + "completion_length": 3228.1875610351562, + "epoch": 0.192, + "grad_norm": 0.02397916279733181, + "kl": 0.007595062255859375, + "learning_rate": 8.557485869176825e-07, + "loss": -0.003, + "reward": 0.05293558700941503, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13956109527498484, + "rewards/cosine_scaled_reward": -0.05299521051347256, + "rewards/format_reward": 0.4166666753590107, + "step": 168 + }, + { + "advantage_max": 0.14650389458984137, + "advantage_mean": -8.45951358269259e-09, + "advantage_min": -0.15014035161584616, + "advantage_std": 0.12220517976675183, + "completion_length": 2530.3542098999023, + "epoch": 0.19314285714285714, + "grad_norm": 0.02601473033428192, + "kl": 0.009768486022949219, + "learning_rate": 8.534360744126753e-07, + "loss": 0.0108, + "reward": 0.22324875311460346, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12220518512185663, + "rewards/cosine_scaled_reward": 0.32412082329392433, + "rewards/format_reward": 0.6666666716337204, + "step": 169 + }, + { + "advantage_max": 0.11422509420663118, + "advantage_mean": 8.925175043472677e-10, + "advantage_min": -0.12703087367117405, + "advantage_std": 0.1002190806902945, + "completion_length": 2565.562515258789, + "epoch": 0.19428571428571428, + "grad_norm": 0.01757214218378067, + "kl": 0.0067424774169921875, + "learning_rate": 
8.511087728614862e-07, + "loss": 0.0112, + "reward": 0.07882664329372346, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10021908720955253, + "rewards/cosine_scaled_reward": 0.008724531158804893, + "rewards/format_reward": 0.43750000558793545, + "step": 170 + }, + { + "advantage_max": 0.18300288636237383, + "advantage_mean": -1.2417634975836478e-09, + "advantage_min": -0.14693648740649223, + "advantage_std": 0.12005474278703332, + "completion_length": 2807.2291870117188, + "epoch": 0.19542857142857142, + "grad_norm": 0.022502249106764793, + "kl": 0.006862640380859375, + "learning_rate": 8.487667956935087e-07, + "loss": 0.0131, + "reward": 0.07513996493071318, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12005474418401718, + "rewards/cosine_scaled_reward": 0.012896033469587564, + "rewards/format_reward": 0.41666667349636555, + "step": 171 + }, + { + "advantage_max": 0.16364847961813211, + "advantage_mean": -2.173086016687975e-09, + "advantage_min": -0.12421727832406759, + "advantage_std": 0.11818943079560995, + "completion_length": 2927.9791870117188, + "epoch": 0.19657142857142856, + "grad_norm": 0.021065451204776764, + "kl": 0.010840415954589844, + "learning_rate": 8.464102570534061e-07, + "loss": 0.0073, + "reward": 0.06200486654415727, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11818943126127124, + "rewards/cosine_scaled_reward": 0.02644550008699298, + "rewards/format_reward": 0.31250000186264515, + "step": 172 + }, + { + "advantage_max": 0.19667653134092689, + "advantage_mean": -4.229756791346295e-09, + "advantage_min": -0.16769259562715888, + "advantage_std": 0.1501930463127792, + "completion_length": 1986.583381652832, + "epoch": 0.1977142857142857, + "grad_norm": 0.023256558924913406, + "kl": 0.007579803466796875, + "learning_rate": 8.440392717955475e-07, + "loss": 0.0107, + "reward": 0.09084123687352985, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.15019305399619043, + 
"rewards/cosine_scaled_reward": -0.06845231936313212, + "rewards/format_reward": 0.6666666772216558, + "step": 173 + }, + { + "advantage_max": 0.19292704621329904, + "advantage_mean": -4.889443558364626e-09, + "advantage_min": -0.15875433292239904, + "advantage_std": 0.13439691066741943, + "completion_length": 2762.854232788086, + "epoch": 0.19885714285714284, + "grad_norm": 0.021673617884516716, + "kl": 0.011868476867675781, + "learning_rate": 8.416539554784089e-07, + "loss": 0.0079, + "reward": 0.06353488937020302, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1343969153240323, + "rewards/cosine_scaled_reward": -0.05013580992817879, + "rewards/format_reward": 0.4791666716337204, + "step": 174 + }, + { + "advantage_max": 0.12862909119576216, + "advantage_mean": -9.778887297140493e-09, + "advantage_min": -0.15183910354971886, + "advantage_std": 0.121172487270087, + "completion_length": 2763.083366394043, + "epoch": 0.2, + "grad_norm": 0.018218420445919037, + "kl": 0.00750732421875, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0052, + "reward": 0.0709586595185101, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.12117248913273215, + "rewards/cosine_scaled_reward": -0.020462360233068466, + "rewards/format_reward": 0.4583333358168602, + "step": 175 + }, + { + "advantage_max": 0.19677884504199028, + "advantage_mean": -2.1730863497548825e-09, + "advantage_min": -0.2427662005648017, + "advantage_std": 0.17106487229466438, + "completion_length": 2771.3542098999023, + "epoch": 0.20114285714285715, + "grad_norm": 0.0242843609303236, + "kl": 0.007083892822265625, + "learning_rate": 8.368407953869103e-07, + "loss": 0.0075, + "reward": 0.13187712128274143, + "reward_advantage_correlation": 1.0, + "reward_std": 0.17106487601995468, + "rewards/cosine_scaled_reward": 0.10790817299857736, + "rewards/format_reward": 0.562500013038516, + "step": 176 + }, + { + "advantage_max": 0.12549755768850446, + "advantage_mean": 
-2.0816681711721685e-17, + "advantage_min": -0.1870342567563057, + "advantage_std": 0.11075702356174588, + "completion_length": 3025.3334045410156, + "epoch": 0.2022857142857143, + "grad_norm": 0.02049132063984871, + "kl": 0.011358261108398438, + "learning_rate": 8.344131861991828e-07, + "loss": 0.0082, + "reward": 0.036965833278372884, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11075702914968133, + "rewards/cosine_scaled_reward": -0.1412050067447126, + "rewards/format_reward": 0.5000000223517418, + "step": 177 + }, + { + "advantage_max": 0.19278418272733688, + "advantage_mean": -4.190951682914168e-09, + "advantage_min": -0.18454215489327908, + "advantage_std": 0.15981952054426074, + "completion_length": 2930.416748046875, + "epoch": 0.20342857142857143, + "grad_norm": 0.02552485093474388, + "kl": 0.013797760009765625, + "learning_rate": 8.319717151140072e-07, + "loss": 0.0099, + "reward": 0.07667386531829834, + "reward_advantage_correlation": 1.0, + "reward_std": 0.15981952100992203, + "rewards/cosine_scaled_reward": -0.03615371882915497, + "rewards/format_reward": 0.5208333525806665, + "step": 178 + }, + { + "advantage_max": 0.15210868138819933, + "advantage_mean": -1.6298145541493447e-09, + "advantage_min": -0.11770202778279781, + "advantage_std": 0.1120380088686943, + "completion_length": 2897.062530517578, + "epoch": 0.20457142857142857, + "grad_norm": 0.021920261904597282, + "kl": 0.008077621459960938, + "learning_rate": 8.295165011252396e-07, + "loss": 0.0015, + "reward": 0.01579895243048668, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11203800793737173, + "rewards/cosine_scaled_reward": -0.16117261722683907, + "rewards/format_reward": 0.41666667349636555, + "step": 179 + }, + { + "advantage_max": 0.15735867712646723, + "advantage_mean": -6.713283928760916e-09, + "advantage_min": -0.1801003571599722, + "advantage_std": 0.12799114314839244, + "completion_length": 2355.6458892822266, + "epoch": 0.2057142857142857, + "grad_norm": 
0.019921960309147835, + "kl": 0.01093292236328125, + "learning_rate": 8.270476638965461e-07, + "loss": -0.0007, + "reward": 0.13479173695668578, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.12799114966765046, + "rewards/cosine_scaled_reward": 0.10458490764722228, + "rewards/format_reward": 0.5833333432674408, + "step": 180 + }, + { + "advantage_max": 0.0872650584205985, + "advantage_mean": 1.6298145749660264e-09, + "advantage_min": -0.09466889966279268, + "advantage_std": 0.0786438356153667, + "completion_length": 3100.9583587646484, + "epoch": 0.20685714285714285, + "grad_norm": 0.017120540142059326, + "kl": 0.0089263916015625, + "learning_rate": 8.245653237555705e-07, + "loss": 0.0029, + "reward": 0.02593186777085066, + "reward_advantage_correlation": 1.0, + "reward_std": 0.07864383282139897, + "rewards/cosine_scaled_reward": -0.06834173109382391, + "rewards/format_reward": 0.29166667349636555, + "step": 181 + }, + { + "advantage_max": 0.13478928711265326, + "advantage_mean": -7.605801224941366e-09, + "advantage_min": -0.21524380147457123, + "advantage_std": 0.14336608722805977, + "completion_length": 2586.5208892822266, + "epoch": 0.208, + "grad_norm": 0.027521653100848198, + "kl": 0.00634765625, + "learning_rate": 8.220696016880687e-07, + "loss": 0.0057, + "reward": 0.14823864586651325, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14336608815938234, + "rewards/cosine_scaled_reward": 0.13677118346095085, + "rewards/format_reward": 0.604166679084301, + "step": 182 + }, + { + "advantage_max": 0.18059027008712292, + "advantage_mean": -2.6387474982847436e-09, + "advantage_min": -0.17588150314986706, + "advantage_std": 0.14301183447241783, + "completion_length": 2779.6250915527344, + "epoch": 0.20914285714285713, + "grad_norm": 0.023653734475374222, + "kl": 0.013004302978515625, + "learning_rate": 8.195606193320136e-07, + "loss": 0.0143, + "reward": 0.04137046728283167, + "reward_advantage_correlation": 0.9999999999999998, + 
"reward_std": 0.1430118391290307, + "rewards/cosine_scaled_reward": -0.10869285650551319, + "rewards/format_reward": 0.45833334140479565, + "step": 183 + }, + { + "advantage_max": 0.127640918828547, + "advantage_mean": 2.7939678071131624e-09, + "advantage_min": -0.1175808496773243, + "advantage_std": 0.10124421585351229, + "completion_length": 2929.5833740234375, + "epoch": 0.2102857142857143, + "grad_norm": 0.019127527251839638, + "kl": 0.009464263916015625, + "learning_rate": 8.170384989716657e-07, + "loss": 0.0029, + "reward": -0.02080875914543867, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10124421957880259, + "rewards/cosine_scaled_reward": -0.19583849888294935, + "rewards/format_reward": 0.2708333395421505, + "step": 184 + }, + { + "advantage_max": 0.15481803310103714, + "advantage_mean": -8.537123716290118e-10, + "advantage_min": -0.10665717558003962, + "advantage_std": 0.1069570422405377, + "completion_length": 2703.2500534057617, + "epoch": 0.21142857142857144, + "grad_norm": 0.02145558036863804, + "kl": 0.010293006896972656, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0095, + "reward": -0.010108587564900517, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10695704608224332, + "rewards/cosine_scaled_reward": -0.24899004492908716, + "rewards/format_reward": 0.4375000037252903, + "step": 185 + }, + { + "advantage_max": 0.12561852345243096, + "advantage_mean": 2.5611370246814147e-09, + "advantage_min": -0.16176388878375292, + "advantage_std": 0.11582700302824378, + "completion_length": 3060.604202270508, + "epoch": 0.21257142857142858, + "grad_norm": 0.020156843587756157, + "kl": 0.008787155151367188, + "learning_rate": 8.119553365707802e-07, + "loss": 0.0001, + "reward": 0.05811757780611515, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1158270058222115, + "rewards/cosine_scaled_reward": -0.01455057691782713, + "rewards/format_reward": 0.3750000111758709, + "step": 186 + }, + { + "advantage_max": 
0.12376056425273418, + "advantage_mean": -1.0089327498463696e-09, + "advantage_min": -0.11852440610527992, + "advantage_std": 0.09066633740440011, + "completion_length": 2529.5625915527344, + "epoch": 0.21371428571428572, + "grad_norm": 0.020506154745817184, + "kl": 0.012113571166992188, + "learning_rate": 8.093945422764069e-07, + "loss": 0.0106, + "reward": 0.05328264785930514, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09066633740440011, + "rewards/cosine_scaled_reward": -0.10320662707090378, + "rewards/format_reward": 0.5208333469927311, + "step": 187 + }, + { + "advantage_max": 0.16143850050866604, + "advantage_mean": 1.0089328331130965e-09, + "advantage_min": -0.10339000448584557, + "advantage_std": 0.103767134482041, + "completion_length": 3437.2916870117188, + "epoch": 0.21485714285714286, + "grad_norm": 0.019638704136013985, + "kl": 0.009023666381835938, + "learning_rate": 8.068211054579943e-07, + "loss": 0.0035, + "reward": -0.034261735156178474, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10376713238656521, + "rewards/cosine_scaled_reward": -0.1733952183276415, + "rewards/format_reward": 0.1458333358168602, + "step": 188 + }, + { + "advantage_max": 0.0868607796728611, + "advantage_mean": -8.537123882823572e-09, + "advantage_min": -0.10214976128190756, + "advantage_std": 0.07864707754924893, + "completion_length": 2609.270881652832, + "epoch": 0.216, + "grad_norm": 0.016191232949495316, + "kl": 0.007904052734375, + "learning_rate": 8.04235151541222e-07, + "loss": 0.0096, + "reward": 0.09003830677829683, + "reward_advantage_correlation": 1.0, + "reward_std": 0.07864707987755537, + "rewards/cosine_scaled_reward": 0.014717839658260345, + "rewards/format_reward": 0.5000000037252903, + "step": 189 + }, + { + "advantage_max": 0.14429873740300536, + "advantage_mean": -2.4835268772060992e-09, + "advantage_min": -0.12300875596702099, + "advantage_std": 0.10701592592522502, + "completion_length": 2838.5208740234375, + "epoch": 
0.21714285714285714, + "grad_norm": 0.016013866290450096, + "kl": 0.01021575927734375, + "learning_rate": 8.01636806561836e-07, + "loss": 0.0091, + "reward": 0.05636314395815134, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10701592825353146, + "rewards/cosine_scaled_reward": -0.021093813586048782, + "rewards/format_reward": 0.37500000931322575, + "step": 190 + }, + { + "advantage_max": 0.19808372668921947, + "advantage_mean": -5.665545754762746e-09, + "advantage_min": -0.18589732144027948, + "advantage_std": 0.1561442338861525, + "completion_length": 2586.6875076293945, + "epoch": 0.21828571428571428, + "grad_norm": 0.031619712710380554, + "kl": 0.010101318359375, + "learning_rate": 7.990261971595048e-07, + "loss": 0.0154, + "reward": 0.089652857510373, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1561442413367331, + "rewards/cosine_scaled_reward": 0.012603958894032985, + "rewards/format_reward": 0.5000000055879354, + "step": 191 + }, + { + "advantage_max": 0.1139423786662519, + "advantage_mean": -1.5522042262627878e-09, + "advantage_min": -0.10976849030703306, + "advantage_std": 0.09138505253940821, + "completion_length": 3170.791717529297, + "epoch": 0.21942857142857142, + "grad_norm": 0.01777508296072483, + "kl": 0.008083343505859375, + "learning_rate": 7.964034505716476e-07, + "loss": 0.0057, + "reward": 0.01685933256521821, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09138506185263395, + "rewards/cosine_scaled_reward": -0.09557726047933102, + "rewards/format_reward": 0.2916666716337204, + "step": 192 + }, + { + "advantage_max": 0.14380517834797502, + "advantage_mean": -3.1044090909038147e-10, + "advantage_min": -0.13345478381961584, + "advantage_std": 0.10799073707312346, + "completion_length": 3189.750045776367, + "epoch": 0.22057142857142858, + "grad_norm": 0.01901032216846943, + "kl": 0.010015487670898438, + "learning_rate": 7.93768694627233e-07, + "loss": 0.0091, + "reward": 0.02204875904135406, + 
"reward_advantage_correlation": 1.0, + "reward_std": 0.10799074079841375, + "rewards/cosine_scaled_reward": -0.11225971346721053, + "rewards/format_reward": 0.35416667722165585, + "step": 193 + }, + { + "advantage_max": 0.15160069148987532, + "advantage_mean": -4.190951627403017e-09, + "advantage_min": -0.1970332907512784, + "advantage_std": 0.1322903553955257, + "completion_length": 3056.916717529297, + "epoch": 0.22171428571428572, + "grad_norm": 0.021130474284291267, + "kl": 0.008333206176757812, + "learning_rate": 7.911220577405484e-07, + "loss": 0.0085, + "reward": 0.15966177079826593, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13229035586118698, + "rewards/cosine_scaled_reward": 0.20346511714160442, + "rewards/format_reward": 0.541666679084301, + "step": 194 + }, + { + "advantage_max": 0.21137044485658407, + "advantage_mean": -2.017865666226193e-09, + "advantage_min": -0.18295289110392332, + "advantage_std": 0.16112522408366203, + "completion_length": 3007.3125610351562, + "epoch": 0.22285714285714286, + "grad_norm": 0.02642098069190979, + "kl": 0.008836746215820312, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0125, + "reward": 0.07984138361644, + "reward_advantage_correlation": 1.0, + "reward_std": 0.16112523339688778, + "rewards/cosine_scaled_reward": -0.01430131122469902, + "rewards/format_reward": 0.5000000149011612, + "step": 195 + }, + { + "advantage_max": 0.11525858240202069, + "advantage_mean": 1.862645163108745e-09, + "advantage_min": -0.11348771117627621, + "advantage_std": 0.09375560656189919, + "completion_length": 3303.4166870117188, + "epoch": 0.224, + "grad_norm": 0.021477309986948967, + "kl": 0.0077724456787109375, + "learning_rate": 7.857936576865356e-07, + "loss": 0.0007, + "reward": 0.050928775453940034, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.09375560656189919, + "rewards/cosine_scaled_reward": -0.02439815178513527, + "rewards/format_reward": 0.35416666977107525, + 
"step": 196 + }, + { + "advantage_max": 0.3016190994530916, + "advantage_mean": -7.605801224941366e-09, + "advantage_min": -0.20797276590019464, + "advantage_std": 0.2091372339054942, + "completion_length": 2295.3333587646484, + "epoch": 0.22514285714285714, + "grad_norm": 0.03227706253528595, + "kl": 0.008520126342773438, + "learning_rate": 7.831121542179086e-07, + "loss": 0.0104, + "reward": 0.11764910374768078, + "reward_advantage_correlation": 1.0, + "reward_std": 0.2091372385621071, + "rewards/cosine_scaled_reward": 0.04368517640978098, + "rewards/format_reward": 0.604166679084301, + "step": 197 + }, + { + "advantage_max": 0.15463142562657595, + "advantage_mean": -4.035731401841325e-09, + "advantage_min": -0.1649370063096285, + "advantage_std": 0.12568959640339017, + "completion_length": 2766.3750610351562, + "epoch": 0.22628571428571428, + "grad_norm": 0.023209044709801674, + "kl": 0.011608123779296875, + "learning_rate": 7.804192891917571e-07, + "loss": 0.011, + "reward": 0.0660445298999548, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12568959454074502, + "rewards/cosine_scaled_reward": -0.045943474397063255, + "rewards/format_reward": 0.47916668094694614, + "step": 198 + }, + { + "advantage_max": 0.14701731782406569, + "advantage_mean": -2.1730860721991263e-09, + "advantage_min": -0.14183137379586697, + "advantage_std": 0.11165185179561377, + "completion_length": 3187.5208435058594, + "epoch": 0.22742857142857142, + "grad_norm": 0.022100677713751793, + "kl": 0.012798309326171875, + "learning_rate": 7.777151938545235e-07, + "loss": 0.0058, + "reward": -0.011955318361287937, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1116518541239202, + "rewards/cosine_scaled_reward": -0.1814872338436544, + "rewards/format_reward": 0.2916666753590107, + "step": 199 + }, + { + "advantage_max": 0.22152548655867577, + "advantage_mean": -3.5700699618779197e-09, + "advantage_min": -0.17661806754767895, + "advantage_std": 
0.16029490064829588, + "completion_length": 2475.854217529297, + "epoch": 0.22857142857142856, + "grad_norm": 0.0255091842263937, + "kl": 0.0064983367919921875, + "learning_rate": 7.75e-07, + "loss": 0.0114, + "reward": 0.12036499596433714, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1602949034422636, + "rewards/cosine_scaled_reward": 0.08125638961791992, + "rewards/format_reward": 0.5416666772216558, + "step": 200 + }, + { + "advantage_max": 0.23342143837362528, + "advantage_mean": -6.208817404651512e-09, + "advantage_min": -0.21431293059140444, + "advantage_std": 0.18895365437492728, + "completion_length": 2669.520851135254, + "epoch": 0.2297142857142857, + "grad_norm": 0.030897650867700577, + "kl": 0.0077991485595703125, + "learning_rate": 7.72273839962904e-07, + "loss": 0.0155, + "reward": 0.1639406383037567, + "reward_advantage_correlation": 1.0, + "reward_std": 0.18895365856587887, + "rewards/cosine_scaled_reward": 0.18311863904818892, + "rewards/format_reward": 0.6041666772216558, + "step": 201 + }, + { + "advantage_max": 0.08613977860659361, + "advantage_mean": -4.346172047253738e-09, + "advantage_min": -0.10160630848258734, + "advantage_std": 0.0818297709338367, + "completion_length": 2086.8333587646484, + "epoch": 0.23085714285714284, + "grad_norm": 0.011819848790764809, + "kl": 0.005578041076660156, + "learning_rate": 7.695368466124296e-07, + "loss": -0.0019, + "reward": 0.18190485704690218, + "reward_advantage_correlation": 1.0, + "reward_std": 0.0818297709338367, + "rewards/cosine_scaled_reward": 0.22224061330780387, + "rewards/format_reward": 0.625, + "step": 202 + }, + { + "advantage_max": 0.19074973743408918, + "advantage_mean": 6.208817349140361e-10, + "advantage_min": -0.09372275089845061, + "advantage_std": 0.11288550030440092, + "completion_length": 3141.375030517578, + "epoch": 0.232, + "grad_norm": 0.021399665623903275, + "kl": 0.0112152099609375, + "learning_rate": 7.667891533457718e-07, + "loss": 0.0082, + 
"reward": 0.017376512056216598, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11288550635799766, + "rewards/cosine_scaled_reward": -0.0945606417953968, + "rewards/format_reward": 0.2916666716337204, + "step": 203 + }, + { + "advantage_max": 0.16842536255717278, + "advantage_mean": -2.949188337986186e-09, + "advantage_min": -0.19366966281086206, + "advantage_std": 0.14565699081867933, + "completion_length": 2458.7292098999023, + "epoch": 0.23314285714285715, + "grad_norm": 0.033364247530698776, + "kl": 0.01160430908203125, + "learning_rate": 7.640308940816239e-07, + "loss": 0.0215, + "reward": 0.0991232428496005, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14565699454396963, + "rewards/cosine_scaled_reward": -0.021544933319091797, + "rewards/format_reward": 0.6250000204890966, + "step": 204 + }, + { + "advantage_max": 0.22404637094587088, + "advantage_mean": -4.035731443474688e-09, + "advantage_min": -0.23686167504638433, + "advantage_std": 0.18611570354551077, + "completion_length": 2529.187530517578, + "epoch": 0.2342857142857143, + "grad_norm": 0.04161351919174194, + "kl": 0.0066680908203125, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0148, + "reward": 0.14069395791739225, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1861157095991075, + "rewards/cosine_scaled_reward": 0.10330121964216232, + "rewards/format_reward": 0.625000013038516, + "step": 205 + }, + { + "advantage_max": 0.191861386410892, + "advantage_mean": 1.8626451908643205e-09, + "advantage_min": -0.09568049665540457, + "advantage_std": 0.11434536194428802, + "completion_length": 3076.7917404174805, + "epoch": 0.23542857142857143, + "grad_norm": 0.017068374902009964, + "kl": 0.007312774658203125, + "learning_rate": 7.584832158039378e-07, + "loss": 0.005, + "reward": -0.04568293271586299, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11434536380693316, + "rewards/cosine_scaled_reward": -0.2805359214544296, + "rewards/format_reward": 
0.2916666716337204, + "step": 206 + }, + { + "advantage_max": 0.17041416559368372, + "advantage_mean": -2.9491882408416714e-09, + "advantage_min": -0.15167315676808357, + "advantage_std": 0.1317178774625063, + "completion_length": 2986.6875610351562, + "epoch": 0.23657142857142857, + "grad_norm": 0.019738713279366493, + "kl": 0.013507843017578125, + "learning_rate": 7.556940671764124e-07, + "loss": 0.0111, + "reward": 0.02802197606069967, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.13171787792816758, + "rewards/cosine_scaled_reward": -0.1678624264895916, + "rewards/format_reward": 0.5000000093132257, + "step": 207 + }, + { + "advantage_max": 0.12179180048406124, + "advantage_mean": 1.6298144223103606e-09, + "advantage_min": -0.12574960058555007, + "advantage_std": 0.09972883993759751, + "completion_length": 2369.2708892822266, + "epoch": 0.2377142857142857, + "grad_norm": 0.019418170675635338, + "kl": 0.006829261779785156, + "learning_rate": 7.528948933102438e-07, + "loss": 0.0023, + "reward": 0.10339555609971285, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09972884552553296, + "rewards/cosine_scaled_reward": 0.014620641246438026, + "rewards/format_reward": 0.583333333954215, + "step": 208 + }, + { + "advantage_max": 0.14903791062533855, + "advantage_mean": -3.880511467713177e-10, + "advantage_min": -0.09558573551476002, + "advantage_std": 0.09572446253150702, + "completion_length": 2642.020835876465, + "epoch": 0.23885714285714285, + "grad_norm": 0.015551680698990822, + "kl": 0.009195327758789062, + "learning_rate": 7.500858306332172e-07, + "loss": 0.0058, + "reward": 0.0819700972060673, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.09572446579113603, + "rewards/cosine_scaled_reward": -0.007855089381337166, + "rewards/format_reward": 0.5000000018626451, + "step": 209 + }, + { + "advantage_max": 0.20507059153169394, + "advantage_mean": -1.2417633865613453e-09, + "advantage_min": 
-0.14836034085601568, + "advantage_std": 0.12791450042277575, + "completion_length": 2679.5208892822266, + "epoch": 0.24, + "grad_norm": 0.013562957756221294, + "kl": 0.0076656341552734375, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0103, + "reward": 0.06892588455229998, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12791450135409832, + "rewards/cosine_scaled_reward": -0.0150822380091995, + "rewards/format_reward": 0.4375000074505806, + "step": 210 + }, + { + "advantage_max": 0.08701092284172773, + "advantage_mean": -1.6298144847604057e-09, + "advantage_min": -0.06568480283021927, + "advantage_std": 0.05974301462993026, + "completion_length": 2295.4583587646484, + "epoch": 0.24114285714285713, + "grad_norm": 0.010742315091192722, + "kl": 0.008317947387695312, + "learning_rate": 7.444385869608921e-07, + "loss": 0.0027, + "reward": 0.0884767509996891, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.05974301369860768, + "rewards/cosine_scaled_reward": -0.061297111213207245, + "rewards/format_reward": 0.6458333395421505, + "step": 211 + }, + { + "advantage_max": 0.1430717734619975, + "advantage_mean": -8.84756487518068e-09, + "advantage_min": -0.15477489028126, + "advantage_std": 0.11844521341845393, + "completion_length": 2305.875015258789, + "epoch": 0.2422857142857143, + "grad_norm": 0.022116854786872864, + "kl": 0.008672714233398438, + "learning_rate": 7.416006812042827e-07, + "loss": 0.0115, + "reward": 0.11736472509801388, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11844521760940552, + "rewards/cosine_scaled_reward": 0.03006414882838726, + "rewards/format_reward": 0.6250000093132257, + "step": 212 + }, + { + "advantage_max": 0.21030229609459639, + "advantage_mean": -1.0865430638551388e-09, + "advantage_min": -0.1627835095860064, + "advantage_std": 0.14803386572748423, + "completion_length": 2458.354202270508, + "epoch": 0.24342857142857144, + "grad_norm": 0.02114744670689106, + "kl": 
0.013973236083984375, + "learning_rate": 7.387534371007797e-07, + "loss": 0.0078, + "reward": 0.09445515216793865, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14803387317806482, + "rewards/cosine_scaled_reward": -0.003314238041639328, + "rewards/format_reward": 0.5625000055879354, + "step": 213 + }, + { + "advantage_max": 0.16307567991316319, + "advantage_mean": 3.8805106350459084e-10, + "advantage_min": -0.1880338666960597, + "advantage_std": 0.13389361603185534, + "completion_length": 2575.6875762939453, + "epoch": 0.24457142857142858, + "grad_norm": 0.02633727341890335, + "kl": 0.008653640747070312, + "learning_rate": 7.358969934210438e-07, + "loss": 0.0085, + "reward": 0.1364652120973915, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1338936211541295, + "rewards/cosine_scaled_reward": 0.07814164273440838, + "rewards/format_reward": 0.6458333488553762, + "step": 214 + }, + { + "advantage_max": 0.11850057449191809, + "advantage_mean": 2.6387473733846534e-09, + "advantage_min": -0.1467567002400756, + "advantage_std": 0.10804584342986345, + "completion_length": 2357.2708587646484, + "epoch": 0.24571428571428572, + "grad_norm": 0.016689570620656013, + "kl": 0.0051898956298828125, + "learning_rate": 7.330314893841101e-07, + "loss": 0.0116, + "reward": 0.09326311138829624, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10804584808647633, + "rewards/cosine_scaled_reward": -0.05246574338525534, + "rewards/format_reward": 0.6458333469927311, + "step": 215 + }, + { + "advantage_max": 0.17286218609660864, + "advantage_mean": -2.0178656801039807e-09, + "advantage_min": -0.2057242812588811, + "advantage_std": 0.15920248720794916, + "completion_length": 2195.8958587646484, + "epoch": 0.24685714285714286, + "grad_norm": 0.028986535966396332, + "kl": 0.009554386138916016, + "learning_rate": 7.301570646506027e-07, + "loss": 0.015, + "reward": 0.08979062891739886, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 
0.1592024890705943, + "rewards/cosine_scaled_reward": -0.01833750121295452, + "rewards/format_reward": 0.5625000111758709, + "step": 216 + }, + { + "advantage_max": 0.2062919419258833, + "advantage_mean": 3.8805107738237865e-10, + "advantage_min": -0.16623112093657255, + "advantage_std": 0.15084476629272103, + "completion_length": 2666.6875915527344, + "epoch": 0.248, + "grad_norm": 0.022731564939022064, + "kl": 0.008672714233398438, + "learning_rate": 7.27273859315928e-07, + "loss": 0.0027, + "reward": 0.06434820429421961, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.15084476908668876, + "rewards/cosine_scaled_reward": -0.05128720495849848, + "rewards/format_reward": 0.47916666977107525, + "step": 217 + }, + { + "advantage_max": 0.2643870050087571, + "advantage_mean": -1.668619659112025e-09, + "advantage_min": -0.1602165149524808, + "advantage_std": 0.16815320495516062, + "completion_length": 2753.666748046875, + "epoch": 0.24914285714285714, + "grad_norm": 0.023222601041197777, + "kl": 0.01018524169921875, + "learning_rate": 7.243820139034464e-07, + "loss": 0.0127, + "reward": 0.05022087972611189, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.16815320681780577, + "rewards/cosine_scaled_reward": -0.07070112135261297, + "rewards/format_reward": 0.4375000074505806, + "step": 218 + }, + { + "advantage_max": 0.18152069114148617, + "advantage_mean": -1.8626452463754717e-09, + "advantage_min": -0.1738226441666484, + "advantage_std": 0.13549907505512238, + "completion_length": 2355.1042251586914, + "epoch": 0.2502857142857143, + "grad_norm": 0.022829996421933174, + "kl": 0.011911392211914062, + "learning_rate": 7.214816693576234e-07, + "loss": 0.0078, + "reward": 0.11652564397081733, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13549907878041267, + "rewards/cosine_scaled_reward": 0.040676600649021566, + "rewards/format_reward": 0.6041666734963655, + "step": 219 + }, + { + "advantage_max": 
0.0978488284163177, + "advantage_mean": -8.537123716290118e-10, + "advantage_min": -0.1063184947706759, + "advantage_std": 0.07688094722107053, + "completion_length": 2560.625045776367, + "epoch": 0.25142857142857145, + "grad_norm": 0.010847953148186207, + "kl": 0.0134124755859375, + "learning_rate": 7.185729670371604e-07, + "loss": 0.0068, + "reward": 0.002609904622659087, + "reward_advantage_correlation": 1.0, + "reward_std": 0.07688094908371568, + "rewards/cosine_scaled_reward": -0.22321993205696344, + "rewards/format_reward": 0.4583333469927311, + "step": 220 + }, + { + "advantage_max": 0.12692446261644363, + "advantage_mean": -5.587935586470749e-09, + "advantage_min": -0.1331609645858407, + "advantage_std": 0.09573783411178738, + "completion_length": 1943.9792098999023, + "epoch": 0.25257142857142856, + "grad_norm": 0.01251673512160778, + "kl": 0.005926609039306641, + "learning_rate": 7.156560487081051e-07, + "loss": 0.0098, + "reward": 0.11415702244266868, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.09573783841915429, + "rewards/cosine_scaled_reward": 0.003712208941578865, + "rewards/format_reward": 0.6666666734963655, + "step": 221 + }, + { + "advantage_max": 0.11880875332280993, + "advantage_mean": -7.372970567409709e-10, + "advantage_min": -0.09530603419989347, + "advantage_std": 0.08716670400463045, + "completion_length": 2160.687515258789, + "epoch": 0.2537142857142857, + "grad_norm": 0.015030966140329838, + "kl": 0.005939483642578125, + "learning_rate": 7.127310565369415e-07, + "loss": 0.0025, + "reward": 0.16150327073410153, + "reward_advantage_correlation": 1.0, + "reward_std": 0.08716670540161431, + "rewards/cosine_scaled_reward": 0.15005341079086065, + "rewards/format_reward": 0.645833333954215, + "step": 222 + }, + { + "advantage_max": 0.12176577933132648, + "advantage_mean": -2.328306450416484e-09, + "advantage_min": -0.1717464942485094, + "advantage_std": 0.11729715252295136, + "completion_length": 2503.500045776367, + 
"epoch": 0.25485714285714284, + "grad_norm": 0.019054196774959564, + "kl": 0.006908416748046875, + "learning_rate": 7.097981330836616e-07, + "loss": 0.0064, + "reward": 0.11482558376155794, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11729715345427394, + "rewards/cosine_scaled_reward": 0.03707345947623253, + "rewards/format_reward": 0.6041666753590107, + "step": 223 + }, + { + "advantage_max": 0.15128592168912292, + "advantage_mean": 9.313226578822054e-10, + "advantage_min": -0.12854185421019793, + "advantage_std": 0.11887390678748488, + "completion_length": 2807.4166870117188, + "epoch": 0.256, + "grad_norm": 0.019626790657639503, + "kl": 0.011783599853515625, + "learning_rate": 7.068574212948169e-07, + "loss": 0.0043, + "reward": 0.02313301805406809, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.11887391144409776, + "rewards/cosine_scaled_reward": -0.1292988988570869, + "rewards/format_reward": 0.3958333395421505, + "step": 224 + }, + { + "advantage_max": 0.14465034054592252, + "advantage_mean": 2.3283067140944524e-10, + "advantage_min": -0.17112653935328126, + "advantage_std": 0.13964845798909664, + "completion_length": 2912.1042404174805, + "epoch": 0.2571428571428571, + "grad_norm": 0.031687695533037186, + "kl": 0.010837554931640625, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0072, + "reward": 0.06882307189516723, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1396484593860805, + "rewards/cosine_scaled_reward": -0.025214084424078465, + "rewards/format_reward": 0.45833334140479565, + "step": 225 + }, + { + "advantage_max": 0.15410880697891116, + "advantage_mean": -6.208817203423589e-09, + "advantage_min": -0.1973248701542616, + "advantage_std": 0.13895932119339705, + "completion_length": 2613.041717529297, + "epoch": 0.2582857142857143, + "grad_norm": 0.02550850808620453, + "kl": 0.012088775634765625, + "learning_rate": 7.009532063876148e-07, + "loss": 0.0088, + "reward": 
0.11753622768446803, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13895932817831635, + "rewards/cosine_scaled_reward": 0.09499787259846926, + "rewards/format_reward": 0.5000000055879354, + "step": 226 + }, + { + "advantage_max": 0.20522056613117456, + "advantage_mean": -5.432714417219486e-10, + "advantage_min": -0.13819229789078236, + "advantage_std": 0.13077273219823837, + "completion_length": 2225.187545776367, + "epoch": 0.25942857142857145, + "grad_norm": 0.02267509512603283, + "kl": 0.014047622680664062, + "learning_rate": 6.979899910323624e-07, + "loss": 0.0049, + "reward": 0.06590676098130643, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13077273732051253, + "rewards/cosine_scaled_reward": -0.14173524361103773, + "rewards/format_reward": 0.6666666772216558, + "step": 227 + }, + { + "advantage_max": 0.10316049680113792, + "advantage_mean": -9.002785308909189e-09, + "advantage_min": -0.1567352144047618, + "advantage_std": 0.1106303846463561, + "completion_length": 2026.0833473205566, + "epoch": 0.26057142857142856, + "grad_norm": 0.019735384732484818, + "kl": 0.0073490142822265625, + "learning_rate": 6.950195628537299e-07, + "loss": 0.0142, + "reward": 0.12237067025853321, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11063039023429155, + "rewards/cosine_scaled_reward": 0.06896073557436466, + "rewards/format_reward": 0.5833333376795053, + "step": 228 + }, + { + "advantage_max": 0.14611426461488008, + "advantage_mean": -1.862645218619896e-09, + "advantage_min": -0.11763771809637547, + "advantage_std": 0.11251156777143478, + "completion_length": 3169.4791717529297, + "epoch": 0.26171428571428573, + "grad_norm": 0.023638132959604263, + "kl": 0.013431549072265625, + "learning_rate": 6.920420666261961e-07, + "loss": 0.003, + "reward": 0.07627941074315459, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11251156777143478, + "rewards/cosine_scaled_reward": 0.06778724677860737, + "rewards/format_reward": 
0.31250000558793545, + "step": 229 + }, + { + "advantage_max": 0.20736990496516228, + "advantage_mean": 1.396983924373263e-09, + "advantage_min": -0.1171214496716857, + "advantage_std": 0.12726245261728764, + "completion_length": 3142.541717529297, + "epoch": 0.26285714285714284, + "grad_norm": 0.02262984775006771, + "kl": 0.01515960693359375, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0136, + "reward": -0.014790376415476203, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12726245447993279, + "rewards/cosine_scaled_reward": -0.2001241371035576, + "rewards/format_reward": 0.31250000931322575, + "step": 230 + }, + { + "advantage_max": 0.1515338383615017, + "advantage_mean": 2.0178655690816782e-09, + "advantage_min": -0.16625231131911278, + "advantage_std": 0.12852926715277135, + "completion_length": 2640.020866394043, + "epoch": 0.264, + "grad_norm": 0.02330903708934784, + "kl": 0.011930465698242188, + "learning_rate": 6.860664508377001e-07, + "loss": 0.0066, + "reward": 0.10281063965521753, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1285292694810778, + "rewards/cosine_scaled_reward": 0.04173674341291189, + "rewards/format_reward": 0.5208333469927311, + "step": 231 + }, + { + "advantage_max": 0.20202800119295716, + "advantage_mean": -1.3969838896787934e-09, + "advantage_min": -0.13641920872032642, + "advantage_std": 0.12839107867330313, + "completion_length": 3155.4583892822266, + "epoch": 0.2651428571428571, + "grad_norm": 0.02304697036743164, + "kl": 0.011341094970703125, + "learning_rate": 6.83068622519821e-07, + "loss": 0.0023, + "reward": 0.0004285484756110236, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12839107774198055, + "rewards/cosine_scaled_reward": -0.1556595927104354, + "rewards/format_reward": 0.31250000186264515, + "step": 232 + }, + { + "advantage_max": 0.207712696865201, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.1607958972454071, + "advantage_std": 
0.13828255608677864, + "completion_length": 2502.68754196167, + "epoch": 0.2662857142857143, + "grad_norm": 0.031160853803157806, + "kl": 0.012420654296875, + "learning_rate": 6.800643086250121e-07, + "loss": 0.0161, + "reward": 0.04692278621951118, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1382825607433915, + "rewards/cosine_scaled_reward": -0.1229497455060482, + "rewards/format_reward": 0.520833345130086, + "step": 233 + }, + { + "advantage_max": 0.13861850136891007, + "advantage_mean": -2.638747317873502e-09, + "advantage_min": -0.13130635116249323, + "advantage_std": 0.1169007895514369, + "completion_length": 2610.875045776367, + "epoch": 0.2674285714285714, + "grad_norm": 0.01976979523897171, + "kl": 0.009636878967285156, + "learning_rate": 6.770536555792944e-07, + "loss": 0.0069, + "reward": 0.07872189255431294, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11690079234540462, + "rewards/cosine_scaled_reward": -0.007438613101840019, + "rewards/format_reward": 0.4791666716337204, + "step": 234 + }, + { + "advantage_max": 0.13972541643306613, + "advantage_mean": -5.665545844968367e-09, + "advantage_min": -0.1696104733273387, + "advantage_std": 0.12484075641259551, + "completion_length": 2025.4583892822266, + "epoch": 0.26857142857142857, + "grad_norm": 0.027829406782984734, + "kl": 0.012134552001953125, + "learning_rate": 6.740368101176495e-07, + "loss": 0.012, + "reward": 0.18675878643989563, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12484075920656323, + "rewards/cosine_scaled_reward": 0.2382234064862132, + "rewards/format_reward": 0.6250000074505806, + "step": 235 + }, + { + "advantage_max": 0.17086784075945616, + "advantage_mean": -6.926711669519303e-09, + "advantage_min": -0.16489312052726746, + "advantage_std": 0.12884586374275386, + "completion_length": 2521.8334045410156, + "epoch": 0.26971428571428574, + "grad_norm": 0.01972981169819832, + "kl": 0.008334159851074219, + 
"learning_rate": 6.710139192768694e-07, + "loss": 0.0013, + "reward": 0.12938043719623238, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12884586746804416, + "rewards/cosine_scaled_reward": 0.0801224485039711, + "rewards/format_reward": 0.6041666753590107, + "step": 236 + }, + { + "advantage_max": 0.12039782106876373, + "advantage_mean": -1.7074246044801455e-09, + "advantage_min": -0.12975230207666755, + "advantage_std": 0.09648847626522183, + "completion_length": 2152.375045776367, + "epoch": 0.27085714285714285, + "grad_norm": 0.015118488110601902, + "kl": 0.009752273559570312, + "learning_rate": 6.679851303883891e-07, + "loss": 0.0075, + "reward": 0.1261741843773052, + "reward_advantage_correlation": 1.0, + "reward_std": 0.096488481387496, + "rewards/cosine_scaled_reward": 0.06003169761970639, + "rewards/format_reward": 0.6250000093132257, + "step": 237 + }, + { + "advantage_max": 0.20382637344300747, + "advantage_mean": -2.483526898022781e-09, + "advantage_min": -0.21393706556409597, + "advantage_std": 0.17641730420291424, + "completion_length": 2390.8958740234375, + "epoch": 0.272, + "grad_norm": 0.02798081561923027, + "kl": 0.01264190673828125, + "learning_rate": 6.649505910711058e-07, + "loss": 0.011, + "reward": 0.17759971413761377, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1764173088595271, + "rewards/cosine_scaled_reward": 0.17150592245161533, + "rewards/format_reward": 0.7083333432674408, + "step": 238 + }, + { + "advantage_max": 0.11714844591915607, + "advantage_mean": -2.0954758137015084e-09, + "advantage_min": -0.14218166377395391, + "advantage_std": 0.10565405956003815, + "completion_length": 1920.2708435058594, + "epoch": 0.27314285714285713, + "grad_norm": 0.012746420688927174, + "kl": 0.00591278076171875, + "learning_rate": 6.619104492241847e-07, + "loss": 0.0072, + "reward": 0.1769508863799274, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10565406491514295, + "rewards/cosine_scaled_reward": 
0.17638282477855682, + "rewards/format_reward": 0.6875000074505806, + "step": 239 + }, + { + "advantage_max": 0.11744444910436869, + "advantage_mean": -4.113341431355444e-09, + "advantage_min": -0.08588352426886559, + "advantage_std": 0.07977030100300908, + "completion_length": 2931.6458740234375, + "epoch": 0.2742857142857143, + "grad_norm": 0.03343014419078827, + "kl": 0.0148162841796875, + "learning_rate": 6.588648530198504e-07, + "loss": -0.0001, + "reward": -0.016711448086425662, + "reward_advantage_correlation": 1.0, + "reward_std": 0.07977030146867037, + "rewards/cosine_scaled_reward": -0.2383982054889202, + "rewards/format_reward": 0.3750000037252903, + "step": 240 + }, + { + "advantage_max": 0.15446932334452868, + "advantage_mean": -3.1044083970144243e-10, + "advantage_min": -0.1100171497091651, + "advantage_std": 0.11261474713683128, + "completion_length": 2998.2083892822266, + "epoch": 0.2754285714285714, + "grad_norm": 0.02460714988410473, + "kl": 0.014617919921875, + "learning_rate": 6.558139508961654e-07, + "loss": -0.0017, + "reward": -0.004733615671284497, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11261475319042802, + "rewards/cosine_scaled_reward": -0.20185158215463161, + "rewards/format_reward": 0.3750000037252903, + "step": 241 + }, + { + "advantage_max": 0.17675767396576703, + "advantage_mean": -3.6476801718032803e-09, + "advantage_min": -0.12605122895911336, + "advantage_std": 0.10846545218373649, + "completion_length": 1993.0625305175781, + "epoch": 0.2765714285714286, + "grad_norm": 0.03174210339784622, + "kl": 0.01557159423828125, + "learning_rate": 6.527578915497951e-07, + "loss": 0.0084, + "reward": 0.07294759101932868, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10846545672393404, + "rewards/cosine_scaled_reward": -0.12876278175099287, + "rewards/format_reward": 0.6875000055879354, + "step": 242 + }, + { + "advantage_max": 0.16028737928718328, + "advantage_mean": 1.94025534527853e-09, + "advantage_min": 
-0.13600675389170647, + "advantage_std": 0.11945267952978611, + "completion_length": 2631.3750762939453, + "epoch": 0.2777142857142857, + "grad_norm": 0.019514845684170723, + "kl": 0.00899505615234375, + "learning_rate": 6.496968239287603e-07, + "loss": 0.0074, + "reward": 0.08974379347637296, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11945268930867314, + "rewards/cosine_scaled_reward": 0.0119549349474255, + "rewards/format_reward": 0.5000000111758709, + "step": 243 + }, + { + "advantage_max": 0.2118885600939393, + "advantage_mean": -7.140140090289293e-09, + "advantage_min": -0.15698527079075575, + "advantage_std": 0.15755242481827736, + "completion_length": 2349.479217529297, + "epoch": 0.27885714285714286, + "grad_norm": 0.04179028794169426, + "kl": 0.010972976684570312, + "learning_rate": 6.466308972251785e-07, + "loss": 0.009, + "reward": 0.11599040031433105, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1575524415820837, + "rewards/cosine_scaled_reward": 0.04908072855323553, + "rewards/format_reward": 0.5833333395421505, + "step": 244 + }, + { + "advantage_max": 0.25635348074138165, + "advantage_mean": -6.208816794028849e-10, + "advantage_min": -0.17878746148198843, + "advantage_std": 0.17052227910608053, + "completion_length": 2936.187545776367, + "epoch": 0.28, + "grad_norm": 0.032245736569166183, + "kl": 0.010873794555664062, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0165, + "reward": 0.04710392498964211, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.17052228096872568, + "rewards/cosine_scaled_reward": -0.049887945875525475, + "rewards/format_reward": 0.37500000931322575, + "step": 245 + }, + { + "advantage_max": 0.19493758492171764, + "advantage_mean": -6.674478622570312e-09, + "advantage_min": -0.17645201738923788, + "advantage_std": 0.14856922021135688, + "completion_length": 2657.6250610351562, + "epoch": 0.28114285714285714, + "grad_norm": 0.034092389047145844, + "kl": 0.01275634765625, + 
"learning_rate": 6.404850645156841e-07, + "loss": 0.0126, + "reward": 0.12820027698762715, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1485692197456956, + "rewards/cosine_scaled_reward": 0.08732841722667217, + "rewards/format_reward": 0.5833333414047956, + "step": 246 + }, + { + "advantage_max": 0.18537013605237007, + "advantage_mean": 3.5700699618779197e-09, + "advantage_min": -0.12155716121196747, + "advantage_std": 0.12443299405276775, + "completion_length": 2937.5833587646484, + "epoch": 0.2822857142857143, + "grad_norm": 0.023569073528051376, + "kl": 0.0147857666015625, + "learning_rate": 6.374054580489873e-07, + "loss": 0.0026, + "reward": 0.011105528741609305, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.12443300196900964, + "rewards/cosine_scaled_reward": -0.13518046215176582, + "rewards/format_reward": 0.33333333395421505, + "step": 247 + }, + { + "advantage_max": 0.199058273807168, + "advantage_mean": -6.984919739827511e-09, + "advantage_min": -0.16079838667064905, + "advantage_std": 0.13820406701415777, + "completion_length": 2236.7916946411133, + "epoch": 0.2834285714285714, + "grad_norm": 0.02281934767961502, + "kl": 0.012094497680664062, + "learning_rate": 6.343215915635761e-07, + "loss": 0.0039, + "reward": 0.13462523429188877, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13820407399907708, + "rewards/cosine_scaled_reward": 0.10365579696372151, + "rewards/format_reward": 0.5833333376795053, + "step": 248 + }, + { + "advantage_max": 0.10034069698303938, + "advantage_mean": 6.984919517782906e-10, + "advantage_min": -0.12371373269706964, + "advantage_std": 0.09401640691794455, + "completion_length": 2077.8125534057617, + "epoch": 0.2845714285714286, + "grad_norm": 0.017827067524194717, + "kl": 0.011096954345703125, + "learning_rate": 6.31233615362752e-07, + "loss": 0.0095, + "reward": 0.1499801934696734, + "reward_advantage_correlation": 1.0, + "reward_std": 0.0940164087805897, + 
"rewards/cosine_scaled_reward": 0.11817886866629124, + "rewards/format_reward": 0.6458333358168602, + "step": 249 + }, + { + "advantage_max": 0.17879820056259632, + "advantage_mean": -5.975986677730916e-09, + "advantage_min": -0.11353706941008568, + "advantage_std": 0.1170863639563322, + "completion_length": 2327.854202270508, + "epoch": 0.2857142857142857, + "grad_norm": 0.019272523000836372, + "kl": 0.01653289794921875, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0106, + "reward": 0.049585518427193165, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1170863676816225, + "rewards/cosine_scaled_reward": -0.15713005082216114, + "rewards/format_reward": 0.6041666697710752, + "step": 250 + }, + { + "advantage_max": 0.22082332614809275, + "advantage_mean": -1.7074247710135992e-09, + "advantage_min": -0.15717947948724031, + "advantage_std": 0.14891249779611826, + "completion_length": 1988.0625228881836, + "epoch": 0.28685714285714287, + "grad_norm": 0.022272994741797447, + "kl": 0.012928009033203125, + "learning_rate": 6.25045936022246e-07, + "loss": 0.004, + "reward": 0.10127391491550952, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14891249779611826, + "rewards/cosine_scaled_reward": -0.05721327941864729, + "rewards/format_reward": 0.708333345130086, + "step": 251 + }, + { + "advantage_max": 0.10648482386022806, + "advantage_mean": 1.6298146165993899e-09, + "advantage_min": -0.12054414115846157, + "advantage_std": 0.09566444996744394, + "completion_length": 2717.187511444092, + "epoch": 0.288, + "grad_norm": 0.016168739646673203, + "kl": 0.011434555053710938, + "learning_rate": 6.219465344613258e-07, + "loss": 0.0024, + "reward": 0.03236317203845829, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.09566445369273424, + "rewards/cosine_scaled_reward": -0.10126717574894428, + "rewards/format_reward": 0.3958333432674408, + "step": 252 + }, + { + "advantage_max": 0.1952264877036214, + "advantage_mean": 
-2.017865596837254e-09, + "advantage_min": -0.21492321323603392, + "advantage_std": 0.16749495640397072, + "completion_length": 2362.8750228881836, + "epoch": 0.28914285714285715, + "grad_norm": 0.02706560119986534, + "kl": 0.014007568359375, + "learning_rate": 6.188436263278172e-07, + "loss": 0.0082, + "reward": 0.11930957529693842, + "reward_advantage_correlation": 1.0, + "reward_std": 0.16749496012926102, + "rewards/cosine_scaled_reward": 0.03688877751119435, + "rewards/format_reward": 0.6250000149011612, + "step": 253 + }, + { + "advantage_max": 0.26456522569060326, + "advantage_mean": 1.0865430083439875e-09, + "advantage_min": -0.14143134467303753, + "advantage_std": 0.16227889992296696, + "completion_length": 3049.5000610351562, + "epoch": 0.29028571428571426, + "grad_norm": 0.03010343573987484, + "kl": 0.020702362060546875, + "learning_rate": 6.157373628530852e-07, + "loss": 0.0114, + "reward": -0.001374326879158616, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.16227890457957983, + "rewards/cosine_scaled_reward": -0.18095367995556444, + "rewards/format_reward": 0.3541666716337204, + "step": 254 + }, + { + "advantage_max": 0.18009355012327433, + "advantage_mean": -3.764095424241276e-09, + "advantage_min": -0.15930992551147938, + "advantage_std": 0.13817342184484005, + "completion_length": 2864.5833892822266, + "epoch": 0.2914285714285714, + "grad_norm": 0.023358209058642387, + "kl": 0.0135955810546875, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0038, + "reward": 0.0554531047528144, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13817342836409807, + "rewards/cosine_scaled_reward": -0.06686954014003277, + "rewards/format_reward": 0.45833334140479565, + "step": 255 + }, + { + "advantage_max": 0.13159022433683276, + "advantage_mean": -3.5700700312668587e-09, + "advantage_min": -0.16444099321961403, + "advantage_std": 0.11331630731001496, + "completion_length": 2758.104263305664, + "epoch": 
0.2925714285714286, + "grad_norm": 0.02609197422862053, + "kl": 0.017688751220703125, + "learning_rate": 6.095153756157051e-07, + "loss": 0.0155, + "reward": 0.041455244878306985, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11331631150096655, + "rewards/cosine_scaled_reward": -0.08538339659571648, + "rewards/format_reward": 0.4166666753590107, + "step": 256 + }, + { + "advantage_max": 0.26734560914337635, + "advantage_mean": 2.949188213086096e-09, + "advantage_min": -0.17771659325808287, + "advantage_std": 0.18554198555648327, + "completion_length": 2963.2500534057617, + "epoch": 0.2937142857142857, + "grad_norm": 0.03234243020415306, + "kl": 0.01363372802734375, + "learning_rate": 6.06399955103937e-07, + "loss": 0.0175, + "reward": 0.11527824534277897, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.18554198974743485, + "rewards/cosine_scaled_reward": 0.12133318744599819, + "rewards/format_reward": 0.4375000074505806, + "step": 257 + }, + { + "advantage_max": 0.14878903981298208, + "advantage_mean": -3.104408258236546e-10, + "advantage_min": -0.2111966870725155, + "advantage_std": 0.14870566432364285, + "completion_length": 2749.2708892822266, + "epoch": 0.2948571428571429, + "grad_norm": 0.025443458929657936, + "kl": 0.012126922607421875, + "learning_rate": 6.032817857379256e-07, + "loss": 0.0117, + "reward": 0.09438971313647926, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14870566176250577, + "rewards/cosine_scaled_reward": -0.04469215031713247, + "rewards/format_reward": 0.6458333507180214, + "step": 258 + }, + { + "advantage_max": 0.1835308726876974, + "advantage_mean": 4.579002871318849e-09, + "advantage_min": -0.1416417476721108, + "advantage_std": 0.12419275566935539, + "completion_length": 2493.3541717529297, + "epoch": 0.296, + "grad_norm": 0.02159561589360237, + "kl": 0.016582489013671875, + "learning_rate": 6.001610194928464e-07, + "loss": 0.0053, + "reward": 0.08983549298136495, + 
"reward_advantage_correlation": 1.0, + "reward_std": 0.12419275799766183, + "rewards/cosine_scaled_reward": 0.003949735313653946, + "rewards/format_reward": 0.5208333376795053, + "step": 259 + }, + { + "advantage_max": 0.11710935411974788, + "advantage_mean": -3.7252903400952775e-09, + "advantage_min": -0.14876556862145662, + "advantage_std": 0.112224759766832, + "completion_length": 1975.4166831970215, + "epoch": 0.29714285714285715, + "grad_norm": 0.024187171831727028, + "kl": 0.012000083923339844, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0128, + "reward": 0.16194967506453395, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11222475813701749, + "rewards/cosine_scaled_reward": 0.20593629218637943, + "rewards/format_reward": 0.5416666716337204, + "step": 260 + }, + { + "advantage_max": 0.15087209455668926, + "advantage_mean": -7.761021686425451e-10, + "advantage_min": -0.10865057073533535, + "advantage_std": 0.1102964838501066, + "completion_length": 2710.812545776367, + "epoch": 0.29828571428571427, + "grad_norm": 0.022183962166309357, + "kl": 0.012647628784179688, + "learning_rate": 5.939123048916173e-07, + "loss": 0.0111, + "reward": 0.00019670464098453522, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11029648408293724, + "rewards/cosine_scaled_reward": -0.22872492298483849, + "rewards/format_reward": 0.4583333395421505, + "step": 261 + }, + { + "advantage_max": 0.1102948416955769, + "advantage_mean": -1.552203920951456e-10, + "advantage_min": -0.1153049236163497, + "advantage_std": 0.09051254531368613, + "completion_length": 2378.937545776367, + "epoch": 0.29942857142857143, + "grad_norm": 0.015410098247230053, + "kl": 0.015895843505859375, + "learning_rate": 5.907846610890011e-07, + "loss": 0.007, + "reward": 0.02980024111457169, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.09051254950463772, + "rewards/cosine_scaled_reward": -0.18379387632012367, + "rewards/format_reward": 
0.541666679084301, + "step": 262 + }, + { + "advantage_max": 0.17358731664717197, + "advantage_mean": 1.552203920951456e-10, + "advantage_min": -0.12128621805459261, + "advantage_std": 0.1090443255379796, + "completion_length": 2539.7083892822266, + "epoch": 0.30057142857142854, + "grad_norm": 0.022145798429846764, + "kl": 0.014247894287109375, + "learning_rate": 5.87655029499542e-07, + "loss": 0.0043, + "reward": 0.02160137635655701, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10904432833194733, + "rewards/cosine_scaled_reward": -0.2081189528107643, + "rewards/format_reward": 0.5416666753590107, + "step": 263 + }, + { + "advantage_max": 0.20790673978626728, + "advantage_mean": -1.6298145194548752e-09, + "advantage_min": -0.1554710865020752, + "advantage_std": 0.14528044033795595, + "completion_length": 2800.1250915527344, + "epoch": 0.3017142857142857, + "grad_norm": 0.028843365609645844, + "kl": 0.01845550537109375, + "learning_rate": 5.845235626570683e-07, + "loss": 0.0197, + "reward": 0.05024130782112479, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.1452804459258914, + "rewards/cosine_scaled_reward": -0.0917367022484541, + "rewards/format_reward": 0.47916667349636555, + "step": 264 + }, + { + "advantage_max": 0.12884274497628212, + "advantage_mean": -8.071462553882469e-09, + "advantage_min": -0.17412229906767607, + "advantage_std": 0.12255425984039903, + "completion_length": 2051.02091217041, + "epoch": 0.3028571428571429, + "grad_norm": 0.02178996056318283, + "kl": 0.0155487060546875, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0136, + "reward": 0.12704905099235475, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12255426635965705, + "rewards/cosine_scaled_reward": 0.03132200799882412, + "rewards/format_reward": 0.6875000037252903, + "step": 265 + }, + { + "advantage_max": 0.19498047791421413, + "advantage_mean": 3.259629080543114e-09, + "advantage_min": -0.1322614224627614, + "advantage_std": 
0.13702732603996992, + "completion_length": 2982.541732788086, + "epoch": 0.304, + "grad_norm": 0.027023041620850563, + "kl": 0.02156829833984375, + "learning_rate": 5.78255733788191e-07, + "loss": 0.0138, + "reward": -0.0014124545268714428, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1370273269712925, + "rewards/cosine_scaled_reward": -0.1706711007282138, + "rewards/format_reward": 0.3333333358168602, + "step": 266 + }, + { + "advantage_max": 0.146828792989254, + "advantage_mean": -1.396983993762202e-09, + "advantage_min": -0.15964362304657698, + "advantage_std": 0.12036533374339342, + "completion_length": 3103.791702270508, + "epoch": 0.30514285714285716, + "grad_norm": 0.02677420899271965, + "kl": 0.02091217041015625, + "learning_rate": 5.751196772469237e-07, + "loss": 0.0062, + "reward": 0.0031448822701349854, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.12036533374339342, + "rewards/cosine_scaled_reward": -0.1587185263633728, + "rewards/format_reward": 0.33333334140479565, + "step": 267 + }, + { + "advantage_max": 0.24769605975598097, + "advantage_mean": -3.2935835170277983e-09, + "advantage_min": -0.17003098130226135, + "advantage_std": 0.16715412167832255, + "completion_length": 2381.291717529297, + "epoch": 0.3062857142857143, + "grad_norm": 0.03521136939525604, + "kl": 0.022693634033203125, + "learning_rate": 5.71982396408026e-07, + "loss": 0.0072, + "reward": 0.0530893302639015, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.16715412959456444, + "rewards/cosine_scaled_reward": -0.1051311838964466, + "rewards/format_reward": 0.520833345130086, + "step": 268 + }, + { + "advantage_max": 0.19814197719097137, + "advantage_mean": -2.7755575615628914e-17, + "advantage_min": -0.1455021221190691, + "advantage_std": 0.1363191232085228, + "completion_length": 2688.7708740234375, + "epoch": 0.30742857142857144, + "grad_norm": 0.03205736353993416, + "kl": 0.01491546630859375, + "learning_rate": 
5.688440441781398e-07, + "loss": 0.0024, + "reward": 0.04180408164393157, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1363191264681518, + "rewards/cosine_scaled_reward": -0.1398752792738378, + "rewards/format_reward": 0.5208333414047956, + "step": 269 + }, + { + "advantage_max": 0.28430189471691847, + "advantage_mean": -2.4059166048306935e-09, + "advantage_min": -0.22107769828289747, + "advantage_std": 0.20366744883358479, + "completion_length": 2568.7500762939453, + "epoch": 0.30857142857142855, + "grad_norm": 0.03902197256684303, + "kl": 0.01998138427734375, + "learning_rate": 5.657047735161255e-07, + "loss": 0.0161, + "reward": 0.1241414062678814, + "reward_advantage_correlation": 1.0, + "reward_std": 0.20366745628416538, + "rewards/cosine_scaled_reward": 0.09125652257353067, + "rewards/format_reward": 0.5416666846722364, + "step": 270 + }, + { + "advantage_max": 0.20672860275954008, + "advantage_mean": -2.9491883102306105e-09, + "advantage_min": -0.2065953854471445, + "advantage_std": 0.16908213449642062, + "completion_length": 2227.3958587646484, + "epoch": 0.3097142857142857, + "grad_norm": 0.030723223462700844, + "kl": 0.013994216918945312, + "learning_rate": 5.625647374256061e-07, + "loss": 0.0136, + "reward": 0.10949450048792642, + "reward_advantage_correlation": 0.9999999999999997, + "reward_std": 0.16908214567229152, + "rewards/cosine_scaled_reward": -0.001862822100520134, + "rewards/format_reward": 0.6458333432674408, + "step": 271 + }, + { + "advantage_max": 0.23002553265541792, + "advantage_mean": -4.190951558014078e-09, + "advantage_min": -0.14808179438114166, + "advantage_std": 0.15896892128512263, + "completion_length": 2769.5834045410156, + "epoch": 0.31085714285714283, + "grad_norm": 0.05374065041542053, + "kl": 0.0186004638671875, + "learning_rate": 5.594240889475106e-07, + "loss": 0.0201, + "reward": 0.03493615868501365, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.15896892687305808, + 
"rewards/cosine_scaled_reward": -0.10572225786745548, + "rewards/format_reward": 0.41666667349636555, + "step": 272 + }, + { + "advantage_max": 0.19195815734565258, + "advantage_mean": -6.36403778286887e-09, + "advantage_min": -0.1525148469954729, + "advantage_std": 0.14543341007083654, + "completion_length": 2723.666748046875, + "epoch": 0.312, + "grad_norm": 0.03333937004208565, + "kl": 0.016330718994140625, + "learning_rate": 5.562829811526154e-07, + "loss": 0.0138, + "reward": 0.08220908371731639, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14543340960517526, + "rewards/cosine_scaled_reward": 0.04326418973505497, + "rewards/format_reward": 0.3958333358168602, + "step": 273 + }, + { + "advantage_max": 0.2558351717889309, + "advantage_mean": -6.519258133330652e-09, + "advantage_min": -0.22837742511183023, + "advantage_std": 0.19885483849793673, + "completion_length": 1787.9375343322754, + "epoch": 0.31314285714285717, + "grad_norm": 0.03228946030139923, + "kl": 0.013763427734375, + "learning_rate": 5.531415671340826e-07, + "loss": 0.0107, + "reward": 0.17565890843980014, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.19885484222322702, + "rewards/cosine_scaled_reward": 0.14979170076549053, + "rewards/format_reward": 0.7291666753590107, + "step": 274 + }, + { + "advantage_max": 0.1555377847980708, + "advantage_mean": -9.662471971844111e-09, + "advantage_min": -0.13416611310094595, + "advantage_std": 0.1166462292894721, + "completion_length": 2052.4583740234375, + "epoch": 0.3142857142857143, + "grad_norm": 0.012692847289144993, + "kl": 0.015716552734375, + "learning_rate": 5.5e-07, + "loss": 0.0051, + "reward": 0.169587709242478, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11664623068645597, + "rewards/cosine_scaled_reward": 0.15611401526257396, + "rewards/format_reward": 0.6875, + "step": 275 + }, + { + "advantage_max": 0.22789240814745426, + "advantage_mean": -4.540197690028336e-09, + 
"advantage_min": -0.1707323812879622, + "advantage_std": 0.16234863363206387, + "completion_length": 2383.229202270508, + "epoch": 0.31542857142857145, + "grad_norm": 0.03404555842280388, + "kl": 0.023883819580078125, + "learning_rate": 5.468584328659172e-07, + "loss": 0.0148, + "reward": 0.097462791018188, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1623486364260316, + "rewards/cosine_scaled_reward": 0.0030499575659632683, + "rewards/format_reward": 0.5625000055879354, + "step": 276 + }, + { + "advantage_max": 0.14935290860012174, + "advantage_mean": 2.755162625822649e-09, + "advantage_min": -0.14729766873642802, + "advantage_std": 0.11353026679717004, + "completion_length": 2168.8958740234375, + "epoch": 0.31657142857142856, + "grad_norm": 0.01725960150361061, + "kl": 0.020036697387695312, + "learning_rate": 5.437170188473847e-07, + "loss": 0.0069, + "reward": 0.08052603248506784, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11353026377037168, + "rewards/cosine_scaled_reward": -0.0758397476747632, + "rewards/format_reward": 0.6250000018626451, + "step": 277 + }, + { + "advantage_max": 0.1460155351087451, + "advantage_mean": -8.459513443914712e-09, + "advantage_min": -0.15751561522483826, + "advantage_std": 0.11409496748819947, + "completion_length": 1924.4792175292969, + "epoch": 0.3177142857142857, + "grad_norm": 0.02192055620253086, + "kl": 0.01718902587890625, + "learning_rate": 5.405759110524894e-07, + "loss": 0.0053, + "reward": 0.17414987459778786, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11409496935084462, + "rewards/cosine_scaled_reward": 0.07652450073510408, + "rewards/format_reward": 0.8541666828095913, + "step": 278 + }, + { + "advantage_max": 0.18112573074176908, + "advantage_mean": -2.32830644347759e-09, + "advantage_min": -0.1802171589806676, + "advantage_std": 0.1357805049046874, + "completion_length": 2853.2916870117188, + "epoch": 0.31885714285714284, + "grad_norm": 
0.045709144324064255, + "kl": 0.02671051025390625, + "learning_rate": 5.37435262574394e-07, + "loss": 0.0191, + "reward": 0.038704983657225966, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.1357805086299777, + "rewards/cosine_scaled_reward": -0.09425987303256989, + "rewards/format_reward": 0.41666668094694614, + "step": 279 + }, + { + "advantage_max": 0.22648306377232075, + "advantage_mean": -3.2596291638098407e-09, + "advantage_min": -0.18116825073957443, + "advantage_std": 0.1603056015446782, + "completion_length": 2309.437545776367, + "epoch": 0.32, + "grad_norm": 0.02816803753376007, + "kl": 0.023956298828125, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0099, + "reward": 0.15197777177672833, + "reward_advantage_correlation": 1.0, + "reward_std": 0.16030560433864594, + "rewards/cosine_scaled_reward": 0.12034583219792694, + "rewards/format_reward": 0.6458333414047956, + "step": 280 + }, + { + "advantage_max": 0.14990063337609172, + "advantage_mean": -7.761022102759085e-10, + "advantage_min": -0.10904566152021289, + "advantage_std": 0.10730710672214627, + "completion_length": 3093.2708435058594, + "epoch": 0.3211428571428571, + "grad_norm": 0.024048512801527977, + "kl": 0.019947052001953125, + "learning_rate": 5.311559558218603e-07, + "loss": 0.0088, + "reward": -0.002999696182087064, + "reward_advantage_correlation": 1.0, + "reward_std": 0.107307109516114, + "rewards/cosine_scaled_reward": -0.17536162724718451, + "rewards/format_reward": 0.3333333432674408, + "step": 281 + }, + { + "advantage_max": 0.13856762740761042, + "advantage_mean": -2.7939678071131624e-09, + "advantage_min": -0.16603135224431753, + "advantage_std": 0.11764003010466695, + "completion_length": 2342.416732788086, + "epoch": 0.3222857142857143, + "grad_norm": 0.03633342310786247, + "kl": 0.0218658447265625, + "learning_rate": 5.28017603591974e-07, + "loss": 0.0024, + "reward": 0.1459582296665758, + "reward_advantage_correlation": 0.9999999999999999, + 
"reward_std": 0.1176400319673121, + "rewards/cosine_scaled_reward": 0.1172967292368412, + "rewards/format_reward": 0.6250000037252903, + "step": 282 + }, + { + "advantage_max": 0.19806544668972492, + "advantage_mean": -2.4835269951672956e-09, + "advantage_min": -0.18965621013194323, + "advantage_std": 0.16816747142001987, + "completion_length": 2804.6459045410156, + "epoch": 0.32342857142857145, + "grad_norm": 0.04573136195540428, + "kl": 0.01639556884765625, + "learning_rate": 5.248803227530763e-07, + "loss": 0.0092, + "reward": 0.13224803050979972, + "reward_advantage_correlation": 1.0, + "reward_std": 0.16816747328266501, + "rewards/cosine_scaled_reward": 0.17202033032663167, + "rewards/format_reward": 0.4375000037252903, + "step": 283 + }, + { + "advantage_max": 0.18402406759560108, + "advantage_mean": -2.9491882894139287e-09, + "advantage_min": -0.22102447994984686, + "advantage_std": 0.15604595269542187, + "completion_length": 2228.729217529297, + "epoch": 0.32457142857142857, + "grad_norm": 0.03294950723648071, + "kl": 0.020061492919921875, + "learning_rate": 5.21744266211809e-07, + "loss": 0.0181, + "reward": 0.10767308878712356, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.15604595455806702, + "rewards/cosine_scaled_reward": 0.011314274743199348, + "rewards/format_reward": 0.6041666939854622, + "step": 284 + }, + { + "advantage_max": 0.16824073251336813, + "advantage_mean": -2.949188192269414e-09, + "advantage_min": -0.12993684597313404, + "advantage_std": 0.12016739509999752, + "completion_length": 1927.5208587646484, + "epoch": 0.32571428571428573, + "grad_norm": 0.02860177308320999, + "kl": 0.021915435791015625, + "learning_rate": 5.186095868151436e-07, + "loss": -0.0004, + "reward": 0.10571011027786881, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12016740208491683, + "rewards/cosine_scaled_reward": -0.06410545017570257, + "rewards/format_reward": 0.7500000055879354, + "step": 285 + }, + { + "advantage_max": 
0.19652512576431036, + "advantage_mean": -2.561137010803627e-09, + "advantage_min": -0.18228960130363703, + "advantage_std": 0.15237156953662634, + "completion_length": 2139.7916946411133, + "epoch": 0.32685714285714285, + "grad_norm": 0.06984082609415054, + "kl": 0.0286865234375, + "learning_rate": 5.154764373429315e-07, + "loss": -0.0004, + "reward": 0.09307598043233156, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1523715741932392, + "rewards/cosine_scaled_reward": -0.043882093974389136, + "rewards/format_reward": 0.6250000093132257, + "step": 286 + }, + { + "advantage_max": 0.10403442522510886, + "advantage_mean": 2.0954757928848267e-09, + "advantage_min": -0.11645598197355866, + "advantage_std": 0.09277561539784074, + "completion_length": 1798.8958587646484, + "epoch": 0.328, + "grad_norm": 0.025246037170290947, + "kl": 0.018194198608398438, + "learning_rate": 5.123449705004581e-07, + "loss": 0.002, + "reward": 0.08277540933340788, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09277561772614717, + "rewards/cosine_scaled_reward": -0.07126312516629696, + "rewards/format_reward": 0.6250000055879354, + "step": 287 + }, + { + "advantage_max": 0.1896635489538312, + "advantage_mean": -1.552204503818544e-09, + "advantage_min": -0.14109544549137354, + "advantage_std": 0.12216217769309878, + "completion_length": 2708.500068664551, + "epoch": 0.3291428571428571, + "grad_norm": 0.036087606102228165, + "kl": 0.029327392578125, + "learning_rate": 5.09215338910999e-07, + "loss": 0.0044, + "reward": 0.031016689725220203, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12216217769309878, + "rewards/cosine_scaled_reward": -0.11878702905960381, + "rewards/format_reward": 0.41666666977107525, + "step": 288 + }, + { + "advantage_max": 0.1401263326406479, + "advantage_mean": 2.4447218693879336e-09, + "advantage_min": -0.14393609017133713, + "advantage_std": 0.1113440697081387, + "completion_length": 2145.3541679382324, + "epoch": 
0.3302857142857143, + "grad_norm": 0.0327521413564682, + "kl": 0.02288055419921875, + "learning_rate": 5.060876951083828e-07, + "loss": 0.0016, + "reward": 0.06709581252653152, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11134407250210643, + "rewards/cosine_scaled_reward": -0.07541561592370272, + "rewards/format_reward": 0.5416666679084301, + "step": 289 + }, + { + "advantage_max": 0.16939363535493612, + "advantage_mean": -5.161079424942372e-09, + "advantage_min": -0.1828483436256647, + "advantage_std": 0.13901030272245407, + "completion_length": 1707.9583854675293, + "epoch": 0.3314285714285714, + "grad_norm": 0.05445309728384018, + "kl": 0.017580032348632812, + "learning_rate": 5.02962191529556e-07, + "loss": 0.0129, + "reward": 0.1843718090094626, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.1390103055164218, + "rewards/cosine_scaled_reward": 0.1699133664369583, + "rewards/format_reward": 0.7500000204890966, + "step": 290 + }, + { + "advantage_max": 0.25220514833927155, + "advantage_mean": -9.158005617737608e-09, + "advantage_min": -0.1723672477528453, + "advantage_std": 0.16362386476248503, + "completion_length": 1981.354248046875, + "epoch": 0.3325714285714286, + "grad_norm": 0.061871450394392014, + "kl": 0.0141448974609375, + "learning_rate": 4.998389805071536e-07, + "loss": 0.0159, + "reward": 0.09107551211491227, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.16362386848777533, + "rewards/cosine_scaled_reward": -0.07718436582945287, + "rewards/format_reward": 0.687500013038516, + "step": 291 + }, + { + "advantage_max": 0.16672541294246912, + "advantage_mean": -1.3193737213867962e-09, + "advantage_min": -0.12999323196709156, + "advantage_std": 0.11027388600632548, + "completion_length": 2551.8750228881836, + "epoch": 0.33371428571428574, + "grad_norm": 0.02972051128745079, + "kl": 0.026575088500976562, + "learning_rate": 4.967182142620745e-07, + "loss": 0.0011, + "reward": 0.02736746583832428, 
+ "reward_advantage_correlation": 1.0, + "reward_std": 0.11027389112859964, + "rewards/cosine_scaled_reward": -0.17034983914345503, + "rewards/format_reward": 0.5000000111758709, + "step": 292 + }, + { + "advantage_max": 0.1079831887036562, + "advantage_mean": -1.9402554424230445e-09, + "advantage_min": -0.12870269175618887, + "advantage_std": 0.09040121687576175, + "completion_length": 1931.1250839233398, + "epoch": 0.33485714285714285, + "grad_norm": 0.016690244898200035, + "kl": 0.03152656555175781, + "learning_rate": 4.93600044896063e-07, + "loss": 0.0073, + "reward": 0.10857149804360233, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09040121641010046, + "rewards/cosine_scaled_reward": -0.013043075799942017, + "rewards/format_reward": 0.6666666772216558, + "step": 293 + }, + { + "advantage_max": 0.14342291373759508, + "advantage_mean": -5.551115123125783e-17, + "advantage_min": -0.17630695085972548, + "advantage_std": 0.11805281089618802, + "completion_length": 2847.229217529297, + "epoch": 0.336, + "grad_norm": 0.04706273600459099, + "kl": 0.028173446655273438, + "learning_rate": 4.904846243842949e-07, + "loss": 0.0118, + "reward": 0.08359809592366219, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11805281648412347, + "rewards/cosine_scaled_reward": 0.04589838907122612, + "rewards/format_reward": 0.39583334513008595, + "step": 294 + }, + { + "advantage_max": 0.10511154402047396, + "advantage_mean": 1.9790605126912553e-09, + "advantage_min": -0.10564734903164208, + "advantage_std": 0.08633032138459384, + "completion_length": 3080.7291870117188, + "epoch": 0.33714285714285713, + "grad_norm": 0.05320084095001221, + "kl": 0.040599822998046875, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0015, + "reward": 0.02853070362471044, + "reward_advantage_correlation": 1.0, + "reward_std": 0.08633032557554543, + "rewards/cosine_scaled_reward": -0.05254795402288437, + "rewards/format_reward": 0.2708333395421505, + "step": 295 + }, + { + 
"advantage_max": 0.2000164007768035, + "advantage_mean": 9.313225884932663e-10, + "advantage_min": -0.16814672946929932, + "advantage_std": 0.14298693323507905, + "completion_length": 3020.104217529297, + "epoch": 0.3382857142857143, + "grad_norm": 0.06135251745581627, + "kl": 0.0340576171875, + "learning_rate": 4.842626371469149e-07, + "loss": 0.0194, + "reward": 0.02324377093464136, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14298693649470806, + "rewards/cosine_scaled_reward": -0.10881040431559086, + "rewards/format_reward": 0.35416667722165585, + "step": 296 + }, + { + "advantage_max": 0.14637420466169715, + "advantage_mean": 2.9491883241083983e-09, + "advantage_min": -0.11458162683993578, + "advantage_std": 0.09986498206853867, + "completion_length": 3078.687545776367, + "epoch": 0.3394285714285714, + "grad_norm": 0.027566736564040184, + "kl": 0.0302581787109375, + "learning_rate": 4.811563736721829e-07, + "loss": 0.011, + "reward": -0.021376774879172444, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09986498579382896, + "rewards/cosine_scaled_reward": -0.13615123182535172, + "rewards/format_reward": 0.1458333395421505, + "step": 297 + }, + { + "advantage_max": 0.171283058822155, + "advantage_mean": -4.656613150633149e-10, + "advantage_min": -0.10321410931646824, + "advantage_std": 0.10990889801178128, + "completion_length": 2413.7083892822266, + "epoch": 0.3405714285714286, + "grad_norm": 0.03488784655928612, + "kl": 0.023456573486328125, + "learning_rate": 4.780534655386743e-07, + "loss": 0.012, + "reward": 0.07853745546890423, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.10990889696404338, + "rewards/cosine_scaled_reward": -0.02953695846372284, + "rewards/format_reward": 0.5208333469927311, + "step": 298 + }, + { + "advantage_max": 0.1128107258118689, + "advantage_mean": -3.065603638996439e-09, + "advantage_min": -0.1558314487338066, + "advantage_std": 0.11369897332042456, + 
"completion_length": 2749.104202270508, + "epoch": 0.3417142857142857, + "grad_norm": 0.030868861824274063, + "kl": 0.029834747314453125, + "learning_rate": 4.749540639777539e-07, + "loss": 0.004, + "reward": 0.09189531486481428, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.113698975648731, + "rewards/cosine_scaled_reward": 0.06086218822747469, + "rewards/format_reward": 0.4166666716337204, + "step": 299 + }, + { + "advantage_max": 0.1933749821037054, + "advantage_mean": 5.432715666220389e-10, + "advantage_min": -0.1367768244817853, + "advantage_std": 0.1271583898924291, + "completion_length": 3239.916717529297, + "epoch": 0.34285714285714286, + "grad_norm": 0.03370937705039978, + "kl": 0.04051971435546875, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0131, + "reward": -0.02385653683450073, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12715838942676783, + "rewards/cosine_scaled_reward": -0.20604670932516456, + "rewards/format_reward": 0.2708333395421505, + "step": 300 + }, + { + "advantage_max": 0.17281748075038195, + "advantage_mean": -1.9887617525027323e-09, + "advantage_min": -0.14648894406855106, + "advantage_std": 0.12490739766508341, + "completion_length": 2306.6875610351562, + "epoch": 0.344, + "grad_norm": 0.06356479972600937, + "kl": 0.030529022216796875, + "learning_rate": 4.68766384637248e-07, + "loss": 0.0186, + "reward": 0.06374910019803792, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.12490739580243826, + "rewards/cosine_scaled_reward": -0.12478701584041119, + "rewards/format_reward": 0.6250000055879354, + "step": 301 + }, + { + "advantage_max": 0.20410604868084192, + "advantage_mean": 6.208817071584605e-10, + "advantage_min": -0.11600066442042589, + "advantage_std": 0.12798969075083733, + "completion_length": 2438.166702270508, + "epoch": 0.34514285714285714, + "grad_norm": 0.050883322954177856, + "kl": 0.026035308837890625, + "learning_rate": 
4.656784084364238e-07, + "loss": 0.0184, + "reward": 0.04835976893082261, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12798968655988574, + "rewards/cosine_scaled_reward": -0.059276397922076285, + "rewards/format_reward": 0.3958333395421505, + "step": 302 + }, + { + "advantage_max": 0.1770703885704279, + "advantage_mean": -5.122274340796373e-09, + "advantage_min": -0.1827222853899002, + "advantage_std": 0.1458009947091341, + "completion_length": 1774.208366394043, + "epoch": 0.3462857142857143, + "grad_norm": 0.062340594828128815, + "kl": 0.02555084228515625, + "learning_rate": 4.6259454195101267e-07, + "loss": 0.0184, + "reward": 0.10038578975945711, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14580100309103727, + "rewards/cosine_scaled_reward": -0.06850272964220494, + "rewards/format_reward": 0.7291666865348816, + "step": 303 + }, + { + "advantage_max": 0.21447939984500408, + "advantage_mean": -3.2596290527875382e-09, + "advantage_min": -0.15638948930427432, + "advantage_std": 0.13950767274945974, + "completion_length": 2737.937545776367, + "epoch": 0.3474285714285714, + "grad_norm": 0.03145613148808479, + "kl": 0.0426483154296875, + "learning_rate": 4.59514935484316e-07, + "loss": 0.0069, + "reward": 0.06072054826654494, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13950766902416945, + "rewards/cosine_scaled_reward": -0.07232037000358105, + "rewards/format_reward": 0.5000000149011612, + "step": 304 + }, + { + "advantage_max": 0.16362837608903646, + "advantage_mean": 2.0178656107150417e-09, + "advantage_min": -0.1397431530058384, + "advantage_std": 0.11684367200359702, + "completion_length": 2308.729202270508, + "epoch": 0.3485714285714286, + "grad_norm": 0.02347092516720295, + "kl": 0.024188995361328125, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0055, + "reward": 0.04415482934564352, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1168436761945486, + 
"rewards/cosine_scaled_reward": -0.12959892745129764, + "rewards/format_reward": 0.5208333488553762, + "step": 305 + }, + { + "advantage_max": 0.1916728913784027, + "advantage_mean": -1.3969838202898543e-09, + "advantage_min": -0.12143237423151731, + "advantage_std": 0.12374015152454376, + "completion_length": 2760.2500610351562, + "epoch": 0.3497142857142857, + "grad_norm": 0.05556763336062431, + "kl": 0.0508575439453125, + "learning_rate": 4.5336910277482155e-07, + "loss": 0.0157, + "reward": 0.04871231457218528, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12374015431851149, + "rewards/cosine_scaled_reward": -0.05752767622470856, + "rewards/format_reward": 0.3958333358168602, + "step": 306 + }, + { + "advantage_max": 0.22683174721896648, + "advantage_mean": -3.570070017389071e-09, + "advantage_min": -0.19085029885172844, + "advantage_std": 0.16714101657271385, + "completion_length": 2140.104217529297, + "epoch": 0.35085714285714287, + "grad_norm": 0.04561980441212654, + "kl": 0.0324554443359375, + "learning_rate": 4.503031760712397e-07, + "loss": 0.0103, + "reward": 0.10696669295430183, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1671410221606493, + "rewards/cosine_scaled_reward": 0.03263398795388639, + "rewards/format_reward": 0.5625000111758709, + "step": 307 + }, + { + "advantage_max": 0.17833841359242797, + "advantage_mean": -1.5522043372850902e-09, + "advantage_min": -0.1457248479127884, + "advantage_std": 0.1348523572087288, + "completion_length": 3109.5208740234375, + "epoch": 0.352, + "grad_norm": 0.038504425436258316, + "kl": 0.0427093505859375, + "learning_rate": 4.4724210845020494e-07, + "loss": 0.0046, + "reward": 0.03145545581355691, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13485235767439008, + "rewards/cosine_scaled_reward": -0.07493541622534394, + "rewards/format_reward": 0.3333333395421505, + "step": 308 + }, + { + "advantage_max": 0.22489432198926806, + "advantage_mean": 
-2.1730860721991263e-09, + "advantage_min": -0.17644386645406485, + "advantage_std": 0.16854400280863047, + "completion_length": 2918.7083892822266, + "epoch": 0.35314285714285715, + "grad_norm": 0.05082552134990692, + "kl": 0.040985107421875, + "learning_rate": 4.441860491038345e-07, + "loss": 0.0018, + "reward": 0.06658328603953123, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.16854400653392076, + "rewards/cosine_scaled_reward": -0.033274039917159826, + "rewards/format_reward": 0.4583333432674408, + "step": 309 + }, + { + "advantage_max": 0.14540826249867678, + "advantage_mean": -1.2417634420724966e-09, + "advantage_min": -0.10229035327211022, + "advantage_std": 0.09198084427043796, + "completion_length": 2406.8750228881836, + "epoch": 0.35428571428571426, + "grad_norm": 0.04952380806207657, + "kl": 0.0579376220703125, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.0055, + "reward": 0.02953993622213602, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.09198084473609924, + "rewards/cosine_scaled_reward": -0.18812143243849277, + "rewards/format_reward": 0.5416666679084301, + "step": 310 + }, + { + "advantage_max": 0.2078695334494114, + "advantage_mean": -4.8118333900726284e-09, + "advantage_min": -0.1414637891575694, + "advantage_std": 0.13211291236802936, + "completion_length": 2218.9792251586914, + "epoch": 0.3554285714285714, + "grad_norm": 0.0518975704908371, + "kl": 0.038421630859375, + "learning_rate": 4.3808955077581546e-07, + "loss": 0.0172, + "reward": 0.09966501512099057, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13211291562765837, + "rewards/cosine_scaled_reward": 0.021480013616383076, + "rewards/format_reward": 0.541666679084301, + "step": 311 + }, + { + "advantage_max": 0.11580835562199354, + "advantage_mean": -3.0267983952558808e-09, + "advantage_min": -0.13315439969301224, + "advantage_std": 0.09709975449368358, + "completion_length": 2090.3750381469727, + "epoch": 
0.3565714285714286, + "grad_norm": 0.05046186223626137, + "kl": 0.0472564697265625, + "learning_rate": 4.350494089288943e-07, + "loss": 0.0035, + "reward": 0.12841306265909225, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.09709975705482066, + "rewards/cosine_scaled_reward": 0.11523362691514194, + "rewards/format_reward": 0.5208333488553762, + "step": 312 + }, + { + "advantage_max": 0.13410293869674206, + "advantage_mean": -7.062529679136009e-09, + "advantage_min": -0.15682349167764187, + "advantage_std": 0.11782865854911506, + "completion_length": 2920.4167289733887, + "epoch": 0.3577142857142857, + "grad_norm": 0.05611386522650719, + "kl": 0.042430877685546875, + "learning_rate": 4.3201486961161093e-07, + "loss": 0.0011, + "reward": 0.10145594039931893, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11782865924760699, + "rewards/cosine_scaled_reward": 0.10086375288665295, + "rewards/format_reward": 0.39583333767950535, + "step": 313 + }, + { + "advantage_max": 0.14438471477478743, + "advantage_mean": -3.1044086745701804e-09, + "advantage_min": -0.17650274466723204, + "advantage_std": 0.11929162684828043, + "completion_length": 1980.0417137145996, + "epoch": 0.3588571428571429, + "grad_norm": 0.07130403071641922, + "kl": 0.03459930419921875, + "learning_rate": 4.2898608072313045e-07, + "loss": 0.0125, + "reward": 0.12104845186695457, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11929162871092558, + "rewards/cosine_scaled_reward": 0.010619329288601875, + "rewards/format_reward": 0.687500013038516, + "step": 314 + }, + { + "advantage_max": 0.13581964280456305, + "advantage_mean": 3.1044089521259366e-10, + "advantage_min": -0.11171440966427326, + "advantage_std": 0.1006471742875874, + "completion_length": 2960.916732788086, + "epoch": 0.36, + "grad_norm": 0.07105167210102081, + "kl": 0.06281280517578125, + "learning_rate": 4.2596318988235037e-07, + "loss": 0.0045, + "reward": 0.10702863708138466, + 
"reward_advantage_correlation": 1.0, + "reward_std": 0.10064717940986156, + "rewards/cosine_scaled_reward": 0.07692010141909122, + "rewards/format_reward": 0.47916666977107525, + "step": 315 + }, + { + "advantage_max": 0.22751855989918113, + "advantage_mean": -7.761023768093622e-11, + "advantage_min": -0.15471346024423838, + "advantage_std": 0.15318563301116228, + "completion_length": 2986.5834350585938, + "epoch": 0.36114285714285715, + "grad_norm": 0.08785312622785568, + "kl": 0.0533447265625, + "learning_rate": 4.2294634442070553e-07, + "loss": 0.0193, + "reward": -0.0024650731356814504, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.15318563859909773, + "rewards/cosine_scaled_reward": -0.14366810489445925, + "rewards/format_reward": 0.2708333395421505, + "step": 316 + }, + { + "advantage_max": 0.20047110598534346, + "advantage_mean": -5.122274229774071e-09, + "advantage_min": -0.1447579087689519, + "advantage_std": 0.13735897513106465, + "completion_length": 2916.2917404174805, + "epoch": 0.36228571428571427, + "grad_norm": 0.034367047250270844, + "kl": 0.05880928039550781, + "learning_rate": 4.1993569137498776e-07, + "loss": 0.0085, + "reward": 0.03647155943326652, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13735897792503238, + "rewards/cosine_scaled_reward": -0.07012255070731044, + "rewards/format_reward": 0.35416667349636555, + "step": 317 + }, + { + "advantage_max": 0.1579853631556034, + "advantage_mean": -1.5522041985072121e-10, + "advantage_min": -0.11961670313030481, + "advantage_std": 0.10564623354002833, + "completion_length": 2064.000011444092, + "epoch": 0.36342857142857143, + "grad_norm": 0.04541625827550888, + "kl": 0.0489959716796875, + "learning_rate": 4.1693137748017915e-07, + "loss": 0.0173, + "reward": 0.04783309390768409, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10564623679965734, + "rewards/cosine_scaled_reward": -0.20319920778274536, + "rewards/format_reward": 
0.6875000111758709, + "step": 318 + }, + { + "advantage_max": 0.1397206410765648, + "advantage_mean": 1.5522041985072121e-10, + "advantage_min": -0.11532348208129406, + "advantage_std": 0.10406744293868542, + "completion_length": 2528.000045776367, + "epoch": 0.36457142857142855, + "grad_norm": 0.055004946887493134, + "kl": 0.05081939697265625, + "learning_rate": 4.1393354916230005e-07, + "loss": 0.0142, + "reward": 0.02235355321317911, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.10406744759529829, + "rewards/cosine_scaled_reward": -0.1417781561613083, + "rewards/format_reward": 0.41666666977107525, + "step": 319 + }, + { + "advantage_max": 0.1373197054490447, + "advantage_mean": -2.4059166568723978e-09, + "advantage_min": -0.09656485263258219, + "advantage_std": 0.09253839054144919, + "completion_length": 2120.916732788086, + "epoch": 0.3657142857142857, + "grad_norm": 0.08500746637582779, + "kl": 0.058597564697265625, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0016, + "reward": 0.1001118189888075, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09253839147277176, + "rewards/cosine_scaled_reward": -0.06903557199984789, + "rewards/format_reward": 0.7291666679084301, + "step": 320 + }, + { + "advantage_max": 0.18437588680535555, + "advantage_mean": -7.140139798855749e-09, + "advantage_min": -0.1534122722223401, + "advantage_std": 0.13574167201295495, + "completion_length": 1700.7292175292969, + "epoch": 0.3668571428571429, + "grad_norm": 0.059033576399087906, + "kl": 0.0357208251953125, + "learning_rate": 4.079579333738039e-07, + "loss": 0.0032, + "reward": 0.1664855630369857, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1357416776008904, + "rewards/cosine_scaled_reward": 0.10123182460665703, + "rewards/format_reward": 0.7708333414047956, + "step": 321 + }, + { + "advantage_max": 0.20797332702204585, + "advantage_mean": 1.9014503027658947e-09, + "advantage_min": -0.13548879977315664, + 
"advantage_std": 0.145243885461241, + "completion_length": 2375.0833892822266, + "epoch": 0.368, + "grad_norm": 0.058852821588516235, + "kl": 0.062713623046875, + "learning_rate": 4.0498043714627006e-07, + "loss": 0.01, + "reward": 0.014190776884788647, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14524389104917645, + "rewards/cosine_scaled_reward": -0.16725354408845305, + "rewards/format_reward": 0.4166666716337204, + "step": 322 + }, + { + "advantage_max": 0.1604612385854125, + "advantage_mean": -2.949188129819369e-09, + "advantage_min": -0.152086915448308, + "advantage_std": 0.12315284926444292, + "completion_length": 2394.291717529297, + "epoch": 0.36914285714285716, + "grad_norm": 0.03161586821079254, + "kl": 0.04833984375, + "learning_rate": 4.020100089676376e-07, + "loss": 0.0084, + "reward": 0.03113732289057225, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12315285205841064, + "rewards/cosine_scaled_reward": -0.1679130750708282, + "rewards/format_reward": 0.5208333544433117, + "step": 323 + }, + { + "advantage_max": 0.1508808215148747, + "advantage_mean": 1.3877787807814457e-17, + "advantage_min": -0.12193193286657333, + "advantage_std": 0.10500428173691034, + "completion_length": 2905.0209045410156, + "epoch": 0.3702857142857143, + "grad_norm": 0.04525084048509598, + "kl": 0.064605712890625, + "learning_rate": 3.9904679361238526e-07, + "loss": 0.0093, + "reward": -0.02125760749913752, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10500428825616837, + "rewards/cosine_scaled_reward": -0.1961777014657855, + "rewards/format_reward": 0.2708333358168602, + "step": 324 + }, + { + "advantage_max": 0.1821247013285756, + "advantage_mean": -3.725290437239792e-09, + "advantage_min": -0.20376018062233925, + "advantage_std": 0.1471891412511468, + "completion_length": 2759.437515258789, + "epoch": 0.37142857142857144, + "grad_norm": 0.06518473476171494, + "kl": 0.0437469482421875, + "learning_rate": 3.9609093550344907e-07, + "loss": 
0.0159, + "reward": 0.0820797230117023, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14718915149569511, + "rewards/cosine_scaled_reward": 0.03319347696378827, + "rewards/format_reward": 0.41666668094694614, + "step": 325 + }, + { + "advantage_max": 0.16853288700804114, + "advantage_mean": -8.53712503467996e-10, + "advantage_min": -0.1664345981553197, + "advantage_std": 0.1293939589522779, + "completion_length": 2176.375045776367, + "epoch": 0.37257142857142855, + "grad_norm": 0.050375796854496, + "kl": 0.0517578125, + "learning_rate": 3.931425787051832e-07, + "loss": 0.0125, + "reward": 0.07511803903616965, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1293939701281488, + "rewards/cosine_scaled_reward": -0.030897012911736965, + "rewards/format_reward": 0.5000000167638063, + "step": 326 + }, + { + "advantage_max": 0.1837756261229515, + "advantage_mean": -3.259629136054265e-09, + "advantage_min": -0.18448743131011724, + "advantage_std": 0.14881908521056175, + "completion_length": 2198.8333740234375, + "epoch": 0.3737142857142857, + "grad_norm": 0.08379311859607697, + "kl": 0.03250885009765625, + "learning_rate": 3.902018669163384e-07, + "loss": 0.0077, + "reward": 0.14384737284854054, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14881909638643265, + "rewards/cosine_scaled_reward": 0.10746606485918164, + "rewards/format_reward": 0.625, + "step": 327 + }, + { + "advantage_max": 0.16970978770405054, + "advantage_mean": 2.9103831011845216e-09, + "advantage_min": -0.10629498213529587, + "advantage_std": 0.1124201756902039, + "completion_length": 2977.7291870117188, + "epoch": 0.37485714285714283, + "grad_norm": 0.04713457077741623, + "kl": 0.0502471923828125, + "learning_rate": 3.872689434630585e-07, + "loss": 0.0091, + "reward": -0.021038700826466084, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.11242017382755876, + "rewards/cosine_scaled_reward": -0.20708716148510575, + "rewards/format_reward": 
0.29166667722165585, + "step": 328 + }, + { + "advantage_max": 0.15668292297050357, + "advantage_mean": -7.295360440751075e-09, + "advantage_min": -0.20917884726077318, + "advantage_std": 0.15731059899553657, + "completion_length": 1727.7916851043701, + "epoch": 0.376, + "grad_norm": 0.07096515595912933, + "kl": 0.0440673828125, + "learning_rate": 3.843439512918949e-07, + "loss": 0.0088, + "reward": 0.15083087398670614, + "reward_advantage_correlation": 1.0, + "reward_std": 0.157310601323843, + "rewards/cosine_scaled_reward": 0.10906307026743889, + "rewards/format_reward": 0.6666666734963655, + "step": 329 + }, + { + "advantage_max": 0.16370283998548985, + "advantage_mean": 2.5611371773370806e-09, + "advantage_min": -0.13650465942919254, + "advantage_std": 0.11045987298712134, + "completion_length": 2315.125045776367, + "epoch": 0.37714285714285717, + "grad_norm": 0.10308735072612762, + "kl": 0.056667327880859375, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.0108, + "reward": 0.042721322970464826, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11045988043770194, + "rewards/cosine_scaled_reward": -0.16713772248476744, + "rewards/format_reward": 0.5833333395421505, + "step": 330 + }, + { + "advantage_max": 0.1993811996653676, + "advantage_mean": -1.8626451908643205e-09, + "advantage_min": -0.11707951128482819, + "advantage_std": 0.11946522584185004, + "completion_length": 2759.4166946411133, + "epoch": 0.3782857142857143, + "grad_norm": 0.06283679604530334, + "kl": 0.05450439453125, + "learning_rate": 3.785183306423767e-07, + "loss": 0.0064, + "reward": -0.007956791669130325, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11946522584185004, + "rewards/cosine_scaled_reward": -0.17038972454611212, + "rewards/format_reward": 0.2916666679084301, + "step": 331 + }, + { + "advantage_max": 0.1879209829494357, + "advantage_mean": -7.761021547647573e-10, + "advantage_min": -0.1391853764653206, + "advantage_std": 
0.13691260432824492, + "completion_length": 2184.2083740234375, + "epoch": 0.37942857142857145, + "grad_norm": 0.08008905500173569, + "kl": 0.04682159423828125, + "learning_rate": 3.7561798609655373e-07, + "loss": -0.0015, + "reward": 0.0423368806950748, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13691260758787394, + "rewards/cosine_scaled_reward": -0.14775002468377352, + "rewards/format_reward": 0.5416666734963655, + "step": 332 + }, + { + "advantage_max": 0.15720007987692952, + "advantage_mean": -1.7850349878778538e-09, + "advantage_min": -0.18071353994309902, + "advantage_std": 0.13339901249855757, + "completion_length": 2106.9792137145996, + "epoch": 0.38057142857142856, + "grad_norm": 0.03596104308962822, + "kl": 0.0339202880859375, + "learning_rate": 3.72726140684072e-07, + "loss": 0.0019, + "reward": 0.12304100673645735, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13339901715517044, + "rewards/cosine_scaled_reward": -0.006082692489144392, + "rewards/format_reward": 0.729166679084301, + "step": 333 + }, + { + "advantage_max": 0.14941539708524942, + "advantage_mean": 4.656613011855271e-10, + "advantage_min": -0.1259508691728115, + "advantage_std": 0.10819857241585851, + "completion_length": 3429.9375610351562, + "epoch": 0.38171428571428573, + "grad_norm": 0.06314020603895187, + "kl": 0.057037353515625, + "learning_rate": 3.6984293534939737e-07, + "loss": 0.0079, + "reward": -0.027071162359789014, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.10819857893511653, + "rewards/cosine_scaled_reward": -0.16390738973859698, + "rewards/format_reward": 0.16666667349636555, + "step": 334 + }, + { + "advantage_max": 0.09907219745218754, + "advantage_mean": -1.1486311984887365e-08, + "advantage_min": -0.15171983744949102, + "advantage_std": 0.09425494028255343, + "completion_length": 2277.7500381469727, + "epoch": 0.38285714285714284, + "grad_norm": 0.038017332553863525, + "kl": 0.035099029541015625, + 
"learning_rate": 3.6696851061588994e-07, + "loss": 0.0047, + "reward": 0.14433281571837142, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.09425493981689215, + "rewards/cosine_scaled_reward": 0.12261204607784748, + "rewards/format_reward": 0.6041666679084301, + "step": 335 + }, + { + "advantage_max": 0.21837344765663147, + "advantage_mean": -6.053596901534064e-09, + "advantage_min": -0.17420747876167297, + "advantage_std": 0.1544438600540161, + "completion_length": 2266.6458892822266, + "epoch": 0.384, + "grad_norm": 0.0935477614402771, + "kl": 0.033916473388671875, + "learning_rate": 3.641030065789562e-07, + "loss": 0.0221, + "reward": 0.11615504696965218, + "reward_advantage_correlation": 1.0, + "reward_std": 0.15444386564195156, + "rewards/cosine_scaled_reward": 0.10184320248663425, + "rewards/format_reward": 0.47916667722165585, + "step": 336 + }, + { + "advantage_max": 0.19050366338342428, + "advantage_mean": -3.123811228786244e-09, + "advantage_min": -0.19180483371019363, + "advantage_std": 0.1521349996328354, + "completion_length": 2244.2501068115234, + "epoch": 0.3851428571428571, + "grad_norm": 0.11284514516592026, + "kl": 0.034465789794921875, + "learning_rate": 3.612465628992203e-07, + "loss": 0.0136, + "reward": 0.12575469084549695, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1521350033581257, + "rewards/cosine_scaled_reward": 0.005518501624464989, + "rewards/format_reward": 0.7291666828095913, + "step": 337 + }, + { + "advantage_max": 0.19977085664868355, + "advantage_mean": -2.7755575615628914e-17, + "advantage_min": -0.1505418843589723, + "advantage_std": 0.14051949698477983, + "completion_length": 1987.5000457763672, + "epoch": 0.3862857142857143, + "grad_norm": 0.03124089166522026, + "kl": 0.03697967529296875, + "learning_rate": 3.5839931879571725e-07, + "loss": 0.005, + "reward": 0.13385723589453846, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.140519502107054, + 
"rewards/cosine_scaled_reward": 0.008122519589960575, + "rewards/format_reward": 0.7708333488553762, + "step": 338 + }, + { + "advantage_max": 0.11336069833487272, + "advantage_mean": 1.0089328331130965e-09, + "advantage_min": -0.10100190062075853, + "advantage_std": 0.0796776907518506, + "completion_length": 3045.750030517578, + "epoch": 0.38742857142857146, + "grad_norm": 0.03193638101220131, + "kl": 0.044342041015625, + "learning_rate": 3.555614130391079e-07, + "loss": 0.0068, + "reward": -0.012824157951399684, + "reward_advantage_correlation": 1.0, + "reward_std": 0.07967769308015704, + "rewards/cosine_scaled_reward": -0.1936829062178731, + "rewards/format_reward": 0.31250000558793545, + "step": 339 + }, + { + "advantage_max": 0.20058695320039988, + "advantage_mean": -4.113341493805489e-09, + "advantage_min": -0.18251327332109213, + "advantage_std": 0.15282507333904505, + "completion_length": 2754.979248046875, + "epoch": 0.38857142857142857, + "grad_norm": 0.07920674979686737, + "kl": 0.037872314453125, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0156, + "reward": 0.09624811878893524, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.15282507613301277, + "rewards/cosine_scaled_reward": 0.020154272206127644, + "rewards/format_reward": 0.5208333414047956, + "step": 340 + }, + { + "advantage_max": 0.13869278971105814, + "advantage_mean": -1.4435500156340098e-08, + "advantage_min": -0.15075519727542996, + "advantage_std": 0.12164947250857949, + "completion_length": 2472.0833892822266, + "epoch": 0.38971428571428574, + "grad_norm": 0.040600065141916275, + "kl": 0.038482666015625, + "learning_rate": 3.4991416936678276e-07, + "loss": 0.0081, + "reward": 0.18206297140568495, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12164947902783751, + "rewards/cosine_scaled_reward": 0.19754501432180405, + "rewards/format_reward": 0.6666666716337204, + "step": 341 + }, + { + "advantage_max": 0.18315967917442322, + 
"advantage_mean": -4.6566127342995145e-10, + "advantage_min": -0.16725237760692835, + "advantage_std": 0.14502713968977332, + "completion_length": 2800.604217529297, + "epoch": 0.39085714285714285, + "grad_norm": 0.06095492094755173, + "kl": 0.05242919921875, + "learning_rate": 3.471051066897562e-07, + "loss": 0.0124, + "reward": 0.03126559848897159, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14502713968977332, + "rewards/cosine_scaled_reward": -0.09631808660924435, + "rewards/format_reward": 0.3750000074505806, + "step": 342 + }, + { + "advantage_max": 0.22337682452052832, + "advantage_mean": -8.614733849887646e-09, + "advantage_min": -0.18603284191340208, + "advantage_std": 0.16502254595980048, + "completion_length": 2473.2708892822266, + "epoch": 0.392, + "grad_norm": 0.03511941805481911, + "kl": 0.03363037109375, + "learning_rate": 3.4430593282358777e-07, + "loss": 0.0078, + "reward": 0.13036981271579862, + "reward_advantage_correlation": 1.0, + "reward_std": 0.16502255713567138, + "rewards/cosine_scaled_reward": 0.06001020688563585, + "rewards/format_reward": 0.6458333469927311, + "step": 343 + }, + { + "advantage_max": 0.1367392516694963, + "advantage_mean": -6.51925814720844e-09, + "advantage_min": -0.19531613495200872, + "advantage_std": 0.12819127598777413, + "completion_length": 2279.270881652832, + "epoch": 0.3931428571428571, + "grad_norm": 0.04424729198217392, + "kl": 0.026615142822265625, + "learning_rate": 3.4151678419606233e-07, + "loss": 0.013, + "reward": 0.16871812019962817, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12819127459079027, + "rewards/cosine_scaled_reward": 0.18393309600651264, + "rewards/format_reward": 0.6250000093132257, + "step": 344 + }, + { + "advantage_max": 0.13400619849562645, + "advantage_mean": 3.88051125954636e-10, + "advantage_min": -0.12482567969709635, + "advantage_std": 0.09939198894426227, + "completion_length": 2628.2709045410156, + "epoch": 
0.3942857142857143, + "grad_norm": 0.055625367909669876, + "kl": 0.042591094970703125, + "learning_rate": 3.387377967463493e-07, + "loss": 0.004, + "reward": 0.04211104451678693, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.09939198987558484, + "rewards/cosine_scaled_reward": -0.09614112600684166, + "rewards/format_reward": 0.43750000558793545, + "step": 345 + }, + { + "advantage_max": 0.09226818988099694, + "advantage_mean": -5.587935725248627e-09, + "advantage_min": -0.09677781723439693, + "advantage_std": 0.07406530145090073, + "completion_length": 2657.937545776367, + "epoch": 0.3954285714285714, + "grad_norm": 0.037199534475803375, + "kl": 0.04526519775390625, + "learning_rate": 3.359691059183761e-07, + "loss": 0.007, + "reward": 0.07278500194661319, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.0740653061075136, + "rewards/cosine_scaled_reward": -0.046637315303087234, + "rewards/format_reward": 0.5208333414047956, + "step": 346 + }, + { + "advantage_max": 0.190785126760602, + "advantage_mean": -1.396983917434369e-09, + "advantage_min": -0.13941976055502892, + "advantage_std": 0.137190165463835, + "completion_length": 3063.8750610351562, + "epoch": 0.3965714285714286, + "grad_norm": 0.052545636892318726, + "kl": 0.04752349853515625, + "learning_rate": 3.3321084665422803e-07, + "loss": 0.0178, + "reward": 0.01757329748943448, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13719017151743174, + "rewards/cosine_scaled_reward": -0.14757848158478737, + "rewards/format_reward": 0.39583334140479565, + "step": 347 + }, + { + "advantage_max": 0.13676777854561806, + "advantage_mean": -9.701277940699082e-10, + "advantage_min": -0.13411255180835724, + "advantage_std": 0.10507096443325281, + "completion_length": 2446.2083740234375, + "epoch": 0.3977142857142857, + "grad_norm": 0.03185461834073067, + "kl": 0.04033660888671875, + "learning_rate": 3.3046315338757026e-07, + "loss": 0.0118, + "reward": 
0.017525036178994924, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10507096629589796, + "rewards/cosine_scaled_reward": -0.1783681306988001, + "rewards/format_reward": 0.45833333767950535, + "step": 348 + }, + { + "advantage_max": 0.12805233104154468, + "advantage_mean": -5.820766091346741e-10, + "advantage_min": -0.13923298381268978, + "advantage_std": 0.10560231888666749, + "completion_length": 2235.6458892822266, + "epoch": 0.39885714285714285, + "grad_norm": 0.08265335857868195, + "kl": 0.0382232666015625, + "learning_rate": 3.2772616003709616e-07, + "loss": 0.0176, + "reward": 0.107158649538178, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.10560232121497393, + "rewards/cosine_scaled_reward": 0.03427761234343052, + "rewards/format_reward": 0.5625000037252903, + "step": 349 + }, + { + "advantage_max": 0.1867425860837102, + "advantage_mean": -2.095475848395978e-09, + "advantage_min": -0.13708714861422777, + "advantage_std": 0.1264267978258431, + "completion_length": 1768.7500762939453, + "epoch": 0.4, + "grad_norm": 0.054677292704582214, + "kl": 0.02910614013671875, + "learning_rate": 3.250000000000001e-07, + "loss": 0.0072, + "reward": 0.09746129438281059, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12642680434510112, + "rewards/cosine_scaled_reward": -0.1304661799222231, + "rewards/format_reward": 0.8333333395421505, + "step": 350 + }, + { + "advantage_max": 0.16754774982109666, + "advantage_mean": 6.984919309616089e-10, + "advantage_min": -0.17636614479124546, + "advantage_std": 0.1403536181896925, + "completion_length": 2874.229202270508, + "epoch": 0.40114285714285713, + "grad_norm": 0.04100406542420387, + "kl": 0.057098388671875, + "learning_rate": 3.222848061454764e-07, + "loss": 0.0149, + "reward": 0.08200124464929104, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14035362238064408, + "rewards/cosine_scaled_reward": 0.011260326951742172, + "rewards/format_reward": 0.4583333432674408, + "step": 
351 + }, + { + "advantage_max": 0.19811256416141987, + "advantage_mean": -4.9670539625790155e-09, + "advantage_min": -0.14627101365476847, + "advantage_std": 0.1381764942780137, + "completion_length": 2160.687530517578, + "epoch": 0.4022857142857143, + "grad_norm": 0.07493390142917633, + "kl": 0.02725982666015625, + "learning_rate": 3.195807108082429e-07, + "loss": 0.0139, + "reward": 0.09585971757769585, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13817649986594915, + "rewards/cosine_scaled_reward": 0.03348735300824046, + "rewards/format_reward": 0.5000000167638063, + "step": 352 + }, + { + "advantage_max": 0.22151278518140316, + "advantage_mean": -3.6476802342533254e-09, + "advantage_min": -0.1870957650244236, + "advantage_std": 0.16393206408247352, + "completion_length": 1902.333381652832, + "epoch": 0.4034285714285714, + "grad_norm": 0.04224457964301109, + "kl": 0.0423431396484375, + "learning_rate": 3.168878457820915e-07, + "loss": 0.0074, + "reward": 0.1670964928343892, + "reward_advantage_correlation": 1.0, + "reward_std": 0.16393207339569926, + "rewards/cosine_scaled_reward": 0.09996501728892326, + "rewards/format_reward": 0.7708333488553762, + "step": 353 + }, + { + "advantage_max": 0.1349656144157052, + "advantage_mean": -8.226683126388856e-09, + "advantage_min": -0.1224656468257308, + "advantage_std": 0.10073514329269528, + "completion_length": 1743.6042404174805, + "epoch": 0.4045714285714286, + "grad_norm": 0.03602663427591324, + "kl": 0.036907196044921875, + "learning_rate": 3.142063423134644e-07, + "loss": 0.0109, + "reward": 0.14471303531900048, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10073514748364687, + "rewards/cosine_scaled_reward": 0.057613649405539036, + "rewards/format_reward": 0.7291666753590107, + "step": 354 + }, + { + "advantage_max": 0.199096888769418, + "advantage_mean": -3.802900647165153e-09, + "advantage_min": -0.23116095550358295, + "advantage_std": 0.17236013431102037, + 
"completion_length": 1989.2292175292969, + "epoch": 0.4057142857142857, + "grad_norm": 0.05271374434232712, + "kl": 0.044979095458984375, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0004, + "reward": 0.1605138722807169, + "reward_advantage_correlation": 1.0, + "reward_std": 0.17236013896763325, + "rewards/cosine_scaled_reward": 0.10444843280129135, + "rewards/format_reward": 0.7291666753590107, + "step": 355 + }, + { + "advantage_max": 0.21099152276292443, + "advantage_mean": 1.164153294597181e-09, + "advantage_min": -0.17593623790889978, + "advantage_std": 0.16403738921508193, + "completion_length": 2499.9583740234375, + "epoch": 0.40685714285714286, + "grad_norm": 0.0717054158449173, + "kl": 0.043792724609375, + "learning_rate": 3.0887794225945143e-07, + "loss": -0.0039, + "reward": 0.12240747502073646, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.16403739666566253, + "rewards/cosine_scaled_reward": 0.027360030449926853, + "rewards/format_reward": 0.6666666772216558, + "step": 356 + }, + { + "advantage_max": 0.2026348551735282, + "advantage_mean": -2.3283064226609085e-09, + "advantage_min": -0.15383625030517578, + "advantage_std": 0.14354443550109863, + "completion_length": 2546.270881652832, + "epoch": 0.408, + "grad_norm": 0.07251500338315964, + "kl": 0.040679931640625, + "learning_rate": 3.062313053727671e-07, + "loss": 0.0075, + "reward": 0.06861508125439286, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14354444295167923, + "rewards/cosine_scaled_reward": -0.06084787752479315, + "rewards/format_reward": 0.5208333432674408, + "step": 357 + }, + { + "advantage_max": 0.14043398201465607, + "advantage_mean": -9.778887422040583e-09, + "advantage_min": -0.20001309178769588, + "advantage_std": 0.13735910970717669, + "completion_length": 1679.8750610351562, + "epoch": 0.40914285714285714, + "grad_norm": 0.044725243002176285, + "kl": 0.018812179565429688, + "learning_rate": 3.0359654942835247e-07, + 
"loss": 0.007, + "reward": 0.19390923529863358, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13735911808907986, + "rewards/cosine_scaled_reward": 0.18609214387834072, + "rewards/format_reward": 0.7708333358168602, + "step": 358 + }, + { + "advantage_max": 0.1749684326350689, + "advantage_mean": -4.967053976456803e-09, + "advantage_min": -0.16937719751149416, + "advantage_std": 0.14141156151890755, + "completion_length": 2299.6459197998047, + "epoch": 0.4102857142857143, + "grad_norm": 0.06835544854402542, + "kl": 0.0594482421875, + "learning_rate": 3.0097380284049523e-07, + "loss": 0.0138, + "reward": 0.08632952661719173, + "reward_advantage_correlation": 1.0, + "reward_std": 0.141411567106843, + "rewards/cosine_scaled_reward": -0.05791237950325012, + "rewards/format_reward": 0.6250000186264515, + "step": 359 + }, + { + "advantage_max": 0.1915941759943962, + "advantage_mean": 1.3969838619232178e-09, + "advantage_min": -0.1723188515752554, + "advantage_std": 0.14488590229302645, + "completion_length": 2417.229217529297, + "epoch": 0.4114285714285714, + "grad_norm": 0.05922839418053627, + "kl": 0.04900360107421875, + "learning_rate": 2.9836319343816397e-07, + "loss": 0.0139, + "reward": 0.08855638474415173, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14488590601831675, + "rewards/cosine_scaled_reward": -0.028948407620191574, + "rewards/format_reward": 0.5833333507180214, + "step": 360 + }, + { + "advantage_max": 0.2224835902452469, + "advantage_mean": -4.96705380992335e-09, + "advantage_min": -0.24620150960981846, + "advantage_std": 0.1885995902121067, + "completion_length": 2589.166748046875, + "epoch": 0.4125714285714286, + "grad_norm": 0.0959998071193695, + "kl": 0.05340576171875, + "learning_rate": 2.9576484845877793e-07, + "loss": 0.0184, + "reward": 0.08691775565966964, + "reward_advantage_correlation": 0.9999999999999996, + "reward_std": 0.18859959580004215, + "rewards/cosine_scaled_reward": 
-0.005889172665774822, + "rewards/format_reward": 0.5208333432674408, + "step": 361 + }, + { + "advantage_max": 0.08507681265473366, + "advantage_mean": -3.7252902568285506e-09, + "advantage_min": -0.12208179384469986, + "advantage_std": 0.09099588170647621, + "completion_length": 1447.708381652832, + "epoch": 0.4137142857142857, + "grad_norm": 0.05355558171868324, + "kl": 0.04192352294921875, + "learning_rate": 2.931788945420058e-07, + "loss": 0.0133, + "reward": 0.17968237926834263, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.09099588077515364, + "rewards/cosine_scaled_reward": 0.12263097055256367, + "rewards/format_reward": 0.8125000111758709, + "step": 362 + }, + { + "advantage_max": 0.1842280002310872, + "advantage_mean": -6.9073092315297124e-09, + "advantage_min": -0.15253216493874788, + "advantage_std": 0.1245655040256679, + "completion_length": 1517.458396911621, + "epoch": 0.41485714285714287, + "grad_norm": 0.04148838296532631, + "kl": 0.043212890625, + "learning_rate": 2.9060545772359305e-07, + "loss": 0.0098, + "reward": 0.17585814488120377, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12456551007926464, + "rewards/cosine_scaled_reward": 0.11942176637239754, + "rewards/format_reward": 0.791666679084301, + "step": 363 + }, + { + "advantage_max": 0.13093959633260965, + "advantage_mean": 2.9103831254706503e-09, + "advantage_min": -0.1537663722410798, + "advantage_std": 0.11034792196005583, + "completion_length": 2274.2500610351562, + "epoch": 0.416, + "grad_norm": 0.07935275882482529, + "kl": 0.04637908935546875, + "learning_rate": 2.8804466342921987e-07, + "loss": 0.0065, + "reward": 0.038603525958023965, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1103479228913784, + "rewards/cosine_scaled_reward": -0.17720217071473598, + "rewards/format_reward": 0.5833333432674408, + "step": 364 + }, + { + "advantage_max": 0.1946524642407894, + "advantage_mean": -3.802900563898426e-09, + "advantage_min": 
-0.17981757363304496, + "advantage_std": 0.1640670644119382, + "completion_length": 2797.6667098999023, + "epoch": 0.41714285714285715, + "grad_norm": 0.10382858663797379, + "kl": 0.06769561767578125, + "learning_rate": 2.854966364683872e-07, + "loss": 0.016, + "reward": 0.07118311786325648, + "reward_advantage_correlation": 1.0, + "reward_std": 0.16406708303838968, + "rewards/cosine_scaled_reward": -0.0007578474469482899, + "rewards/format_reward": 0.41666667349636555, + "step": 365 + }, + { + "advantage_max": 0.13503147196024656, + "advantage_mean": -5.432715111108877e-10, + "advantage_min": -0.13952044351026416, + "advantage_std": 0.1080038407817483, + "completion_length": 1690.4375267028809, + "epoch": 0.41828571428571426, + "grad_norm": 0.03620801120996475, + "kl": 0.02330780029296875, + "learning_rate": 2.829615010283344e-07, + "loss": -0.001, + "reward": 0.1814038148149848, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1080038477666676, + "rewards/cosine_scaled_reward": 0.09267531940713525, + "rewards/format_reward": 0.875, + "step": 366 + }, + { + "advantage_max": 0.26444832887500525, + "advantage_mean": -3.4148495420271985e-09, + "advantage_min": -0.1704773958772421, + "advantage_std": 0.16923782834783196, + "completion_length": 2643.2292098999023, + "epoch": 0.41942857142857143, + "grad_norm": 0.07513663917779922, + "kl": 0.046604156494140625, + "learning_rate": 2.8043938066798645e-07, + "loss": 0.0162, + "reward": 0.06340029306011274, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.16923783347010612, + "rewards/cosine_scaled_reward": -0.08535546949133277, + "rewards/format_reward": 0.5416666734963655, + "step": 367 + }, + { + "advantage_max": 0.15277679590508342, + "advantage_mean": 3.725290367850853e-09, + "advantage_min": -0.1591153903864324, + "advantage_std": 0.12118267058394849, + "completion_length": 2796.979217529297, + "epoch": 0.4205714285714286, + "grad_norm": 0.06917224824428558, + "kl": 0.0589141845703125, + 
"learning_rate": 2.7793039831193133e-07, + "loss": 0.0198, + "reward": 0.015495523664867505, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12118267104960978, + "rewards/cosine_scaled_reward": -0.14332137536257505, + "rewards/format_reward": 0.37500000931322575, + "step": 368 + }, + { + "advantage_max": 0.15400357451289892, + "advantage_mean": -2.7939677238464355e-09, + "advantage_min": -0.2148400265723467, + "advantage_std": 0.1464338074438274, + "completion_length": 2664.6250610351562, + "epoch": 0.4217142857142857, + "grad_norm": 0.12221620231866837, + "kl": 0.0754241943359375, + "learning_rate": 2.7543467624442956e-07, + "loss": 0.0104, + "reward": 0.10048398980870843, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1464338069781661, + "rewards/cosine_scaled_reward": 0.05717929266393185, + "rewards/format_reward": 0.4791666828095913, + "step": 369 + }, + { + "advantage_max": 0.14356589503586292, + "advantage_mean": -4.190951551075184e-09, + "advantage_min": -0.14298985060304403, + "advantage_std": 0.11907922197133303, + "completion_length": 2785.416702270508, + "epoch": 0.4228571428571429, + "grad_norm": 0.05613729730248451, + "kl": 0.06988525390625, + "learning_rate": 2.729523361034538e-07, + "loss": 0.0162, + "reward": 0.07710259314626455, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.11907922383397818, + "rewards/cosine_scaled_reward": 0.00833977572619915, + "rewards/format_reward": 0.43750000931322575, + "step": 370 + }, + { + "advantage_max": 0.15934794954955578, + "advantage_mean": -5.044664012909816e-09, + "advantage_min": -0.14198165433481336, + "advantage_std": 0.11952129052951932, + "completion_length": 1526.4583587646484, + "epoch": 0.424, + "grad_norm": 0.05127997323870659, + "kl": 0.04236602783203125, + "learning_rate": 2.7048349887476037e-07, + "loss": 0.0102, + "reward": 0.16448132740333676, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11952129052951932, + 
"rewards/cosine_scaled_reward": 0.11803595721721649, + "rewards/format_reward": 0.7291666753590107, + "step": 371 + }, + { + "advantage_max": 0.22729028388857841, + "advantage_mean": 1.2417634420724966e-09, + "advantage_min": -0.15133884362876415, + "advantage_std": 0.14886979572474957, + "completion_length": 2749.7917404174805, + "epoch": 0.42514285714285716, + "grad_norm": 0.10154866427183151, + "kl": 0.0570220947265625, + "learning_rate": 2.6802828488599294e-07, + "loss": 0.0161, + "reward": 0.0784059870056808, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14886980084702373, + "rewards/cosine_scaled_reward": 0.010133292176760733, + "rewards/format_reward": 0.4375000037252903, + "step": 372 + }, + { + "advantage_max": 0.15681373560801148, + "advantage_mean": -1.0399768823887712e-08, + "advantage_min": -0.1237130188383162, + "advantage_std": 0.11755135306157172, + "completion_length": 1514.3958625793457, + "epoch": 0.42628571428571427, + "grad_norm": 0.032398343086242676, + "kl": 0.034709930419921875, + "learning_rate": 2.655868138008171e-07, + "loss": 0.0063, + "reward": 0.08641793858259916, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11755135981366038, + "rewards/cosine_scaled_reward": -0.11321200663223863, + "rewards/format_reward": 0.7291666772216558, + "step": 373 + }, + { + "advantage_max": 0.1916589979082346, + "advantage_mean": -4.967053740534411e-09, + "advantage_min": -0.24126344360411167, + "advantage_std": 0.17274773959070444, + "completion_length": 2046.4375762939453, + "epoch": 0.42742857142857144, + "grad_norm": 0.07882247865200043, + "kl": 0.0540771484375, + "learning_rate": 2.631592046130896e-07, + "loss": 0.0185, + "reward": 0.12403235118836164, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1727477479726076, + "rewards/cosine_scaled_reward": 0.09565928019583225, + "rewards/format_reward": 0.5416666828095913, + "step": 374 + }, + { + "advantage_max": 0.13977034855633974, + "advantage_mean": 
-8.459513499425864e-09, + "advantage_min": -0.21618898399174213, + "advantage_std": 0.14144186303019524, + "completion_length": 2736.4375610351562, + "epoch": 0.42857142857142855, + "grad_norm": 0.13247309625148773, + "kl": 0.079193115234375, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0266, + "reward": 0.07485455530695617, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14144186628982425, + "rewards/cosine_scaled_reward": 0.022881068289279938, + "rewards/format_reward": 0.3958333507180214, + "step": 375 + }, + { + "advantage_max": 0.2252150783315301, + "advantage_mean": 1.249000902703301e-16, + "advantage_min": -0.15643711481243372, + "advantage_std": 0.15819351840764284, + "completion_length": 2360.354248046875, + "epoch": 0.4297142857142857, + "grad_norm": 0.08375173062086105, + "kl": 0.06674385070800781, + "learning_rate": 2.583460445215911e-07, + "loss": 0.0122, + "reward": 0.061880006454885006, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.15819351840764284, + "rewards/cosine_scaled_reward": -0.11892816657200456, + "rewards/format_reward": 0.6041666734963655, + "step": 376 + }, + { + "advantage_max": 0.2330569690093398, + "advantage_mean": 2.6387473733846534e-09, + "advantage_min": -0.16961478628218174, + "advantage_std": 0.15390967670828104, + "completion_length": 2938.9584350585938, + "epoch": 0.4308571428571429, + "grad_norm": 0.08977462351322174, + "kl": 0.1001129150390625, + "learning_rate": 2.5596072820445254e-07, + "loss": 0.0238, + "reward": 0.04838086655945517, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1539096813648939, + "rewards/cosine_scaled_reward": -0.03433333709836006, + "rewards/format_reward": 0.3541666753590107, + "step": 377 + }, + { + "advantage_max": 0.23671884834766388, + "advantage_mean": -1.396983917434369e-09, + "advantage_min": -0.1725560138002038, + "advantage_std": 0.15941207576543093, + "completion_length": 2238.645881652832, + "epoch": 0.432, + "grad_norm": 
0.054851531982421875, + "kl": 0.06591415405273438, + "learning_rate": 2.5358974294659373e-07, + "loss": 0.0141, + "reward": 0.09857236547395587, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.15941207949072123, + "rewards/cosine_scaled_reward": 0.007719084620475769, + "rewards/format_reward": 0.5625000055879354, + "step": 378 + }, + { + "advantage_max": 0.2065592324361205, + "advantage_mean": 6.208817210362483e-10, + "advantage_min": -0.1892344020307064, + "advantage_std": 0.14801982045173645, + "completion_length": 2359.416717529297, + "epoch": 0.43314285714285716, + "grad_norm": 0.0550236701965332, + "kl": 0.06499481201171875, + "learning_rate": 2.512332043064913e-07, + "loss": 0.0143, + "reward": 0.08630741806700826, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14801982697099447, + "rewards/cosine_scaled_reward": -0.05953240022063255, + "rewards/format_reward": 0.625000013038516, + "step": 379 + }, + { + "advantage_max": 0.11885185819119215, + "advantage_mean": -3.2596290944209017e-09, + "advantage_min": -0.15346426516771317, + "advantage_std": 0.11005022726021707, + "completion_length": 2270.250045776367, + "epoch": 0.4342857142857143, + "grad_norm": 0.06920424103736877, + "kl": 0.06329345703125, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0113, + "reward": 0.09566288208588958, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11005022516474128, + "rewards/cosine_scaled_reward": 0.010071046184748411, + "rewards/format_reward": 0.5416666772216558, + "step": 380 + }, + { + "advantage_max": 0.16205658204853535, + "advantage_mean": 2.0372682898311956e-09, + "advantage_min": -0.10958120739087462, + "advantage_std": 0.10907484404742718, + "completion_length": 2616.7500610351562, + "epoch": 0.43542857142857144, + "grad_norm": 0.051583774387836456, + "kl": 0.081939697265625, + "learning_rate": 2.465639255873246e-07, + "loss": 0.0113, + "reward": -0.01113765970512759, + "reward_advantage_correlation": 1.0, + 
"reward_std": 0.10907484823837876, + "rewards/cosine_scaled_reward": -0.2404602374881506, + "rewards/format_reward": 0.4166666753590107, + "step": 381 + }, + { + "advantage_max": 0.18154283426702023, + "advantage_mean": -2.716357583310014e-09, + "advantage_min": -0.14717091992497444, + "advantage_std": 0.12136137438938022, + "completion_length": 1677.3125610351562, + "epoch": 0.43657142857142855, + "grad_norm": 0.04167007654905319, + "kl": 0.0534820556640625, + "learning_rate": 2.4425141308231765e-07, + "loss": 0.0093, + "reward": 0.0900630738469772, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12136137904599309, + "rewards/cosine_scaled_reward": -0.14255204549408518, + "rewards/format_reward": 0.8125000204890966, + "step": 382 + }, + { + "advantage_max": 0.16546936659142375, + "advantage_mean": -2.056670778127767e-09, + "advantage_min": -0.1402215976268053, + "advantage_std": 0.12318957853130996, + "completion_length": 2511.020881652832, + "epoch": 0.4377142857142857, + "grad_norm": 0.10153740644454956, + "kl": 0.08424758911132812, + "learning_rate": 2.4195380233209006e-07, + "loss": 0.012, + "reward": 0.07365681836381555, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1231895792298019, + "rewards/cosine_scaled_reward": -0.03396464860998094, + "rewards/format_reward": 0.500000013038516, + "step": 383 + }, + { + "advantage_max": 0.26926819048821926, + "advantage_mean": -6.519258133330652e-09, + "advantage_min": -0.18919609300792217, + "advantage_std": 0.18077142210677266, + "completion_length": 1919.895896911621, + "epoch": 0.43885714285714283, + "grad_norm": 0.0882238820195198, + "kl": 0.073150634765625, + "learning_rate": 2.3967120531894857e-07, + "loss": 0.0167, + "reward": 0.20658226870000362, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1807714276947081, + "rewards/cosine_scaled_reward": 0.285074929241091, + "rewards/format_reward": 0.6458333414047956, + "step": 384 + }, + { + "advantage_max": 
0.17060526320710778, + "advantage_mean": -1.202958357926498e-09, + "advantage_min": -0.12168563972227275, + "advantage_std": 0.12083742453251034, + "completion_length": 2190.6667251586914, + "epoch": 0.44, + "grad_norm": 0.03899059444665909, + "kl": 0.06510162353515625, + "learning_rate": 2.374037332934512e-07, + "loss": 0.0125, + "reward": 0.06560673611238599, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12083742849063128, + "rewards/cosine_scaled_reward": -0.1208215095102787, + "rewards/format_reward": 0.6250000037252903, + "step": 385 + }, + { + "advantage_max": 0.19935980439186096, + "advantage_mean": -4.5013925920045494e-09, + "advantage_min": -0.2209212351590395, + "advantage_std": 0.16272395756095648, + "completion_length": 2633.6250534057617, + "epoch": 0.44114285714285717, + "grad_norm": 0.115007184445858, + "kl": 0.0880279541015625, + "learning_rate": 2.3515149676898552e-07, + "loss": 0.0139, + "reward": 0.09591761871706694, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1627239640802145, + "rewards/cosine_scaled_reward": -0.011030611349269748, + "rewards/format_reward": 0.5833333488553762, + "step": 386 + }, + { + "advantage_max": 0.15378239285200834, + "advantage_mean": -2.2506962543689113e-09, + "advantage_min": -0.14221325889229774, + "advantage_std": 0.123248225543648, + "completion_length": 2798.7708740234375, + "epoch": 0.4422857142857143, + "grad_norm": 0.05777512118220329, + "kl": 0.1002044677734375, + "learning_rate": 2.3291460551638237e-07, + "loss": 0.02, + "reward": 0.05685883387923241, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12324822414666414, + "rewards/cosine_scaled_reward": -0.03132744878530502, + "rewards/format_reward": 0.3958333358168602, + "step": 387 + }, + { + "advantage_max": 0.17052914388477802, + "advantage_mean": -2.173086155465853e-09, + "advantage_min": -0.13332293508574367, + "advantage_std": 0.12420817371457815, + "completion_length": 2340.6250610351562, + "epoch": 0.44342857142857145, 
+ "grad_norm": 0.04883037880063057, + "kl": 0.093536376953125, + "learning_rate": 2.306931685585657e-07, + "loss": 0.0165, + "reward": 0.11339898826554418, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1242081755772233, + "rewards/cosine_scaled_reward": 0.06429434940218925, + "rewards/format_reward": 0.5416666679084301, + "step": 388 + }, + { + "advantage_max": 0.120490204077214, + "advantage_mean": -2.7939677169075416e-09, + "advantage_min": -0.16002231976017356, + "advantage_std": 0.10619163559749722, + "completion_length": 1919.5833740234375, + "epoch": 0.44457142857142856, + "grad_norm": 0.06259764730930328, + "kl": 0.06043243408203125, + "learning_rate": 2.2848729416523859e-07, + "loss": 0.0032, + "reward": 0.08577554707881063, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.10619163932278752, + "rewards/cosine_scaled_reward": -0.10086721181869507, + "rewards/format_reward": 0.7083333507180214, + "step": 389 + }, + { + "advantage_max": 0.18258788716048002, + "advantage_mean": -2.3283064365386963e-09, + "advantage_min": -0.13553531654179096, + "advantage_std": 0.1262248894199729, + "completion_length": 2658.9375534057617, + "epoch": 0.44571428571428573, + "grad_norm": 0.12601332366466522, + "kl": 0.0760498046875, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.0039, + "reward": 0.05920348968356848, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.12622489035129547, + "rewards/cosine_scaled_reward": -0.03534679673612118, + "rewards/format_reward": 0.4166666679084301, + "step": 390 + }, + { + "advantage_max": 0.17521672742441297, + "advantage_mean": -5.122274354674161e-09, + "advantage_min": -0.19580721389502287, + "advantage_std": 0.15704865427687764, + "completion_length": 2380.625030517578, + "epoch": 0.44685714285714284, + "grad_norm": 0.12029723823070526, + "kl": 0.07598876953125, + "learning_rate": 2.2412266235313973e-07, + "loss": 0.0076, + "reward": 0.13599346484988928, + 
"reward_advantage_correlation": 1.0, + "reward_std": 0.15704865800216794, + "rewards/cosine_scaled_reward": 0.12639703415334225, + "rewards/format_reward": 0.5416666697710752, + "step": 391 + }, + { + "advantage_max": 0.12857118248939514, + "advantage_mean": -6.984920281061235e-10, + "advantage_min": -0.1260515470057726, + "advantage_std": 0.10433831717818975, + "completion_length": 1794.0000457763672, + "epoch": 0.448, + "grad_norm": 0.03922954574227333, + "kl": 0.0439300537109375, + "learning_rate": 2.2196411766036487e-07, + "loss": 0.0043, + "reward": 0.09477179404348135, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.10433832183480263, + "rewards/cosine_scaled_reward": -0.12779118958860636, + "rewards/format_reward": 0.8125000074505806, + "step": 392 + }, + { + "advantage_max": 0.23233003355562687, + "advantage_mean": 2.0178656245928295e-09, + "advantage_min": -0.18882656935602427, + "advantage_std": 0.16627767169848084, + "completion_length": 2182.2708587646484, + "epoch": 0.4491428571428571, + "grad_norm": 0.07548714429140091, + "kl": 0.0761871337890625, + "learning_rate": 2.1982156097370557e-07, + "loss": 0.0054, + "reward": 0.08290141774341464, + "reward_advantage_correlation": 1.0, + "reward_std": 0.16627767169848084, + "rewards/cosine_scaled_reward": -0.05863434381899424, + "rewards/format_reward": 0.604166679084301, + "step": 393 + }, + { + "advantage_max": 0.1498968666419387, + "advantage_mean": 2.250696282124487e-09, + "advantage_min": -0.10414962749928236, + "advantage_std": 0.09691937500610948, + "completion_length": 2409.666763305664, + "epoch": 0.4502857142857143, + "grad_norm": 0.057605672627687454, + "kl": 0.0919647216796875, + "learning_rate": 2.1769509671835223e-07, + "loss": 0.0121, + "reward": -0.004766212543472648, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.09691937174648046, + "rewards/cosine_scaled_reward": -0.2431314792484045, + "rewards/format_reward": 0.45833333767950535, + "step": 
394 + }, + { + "advantage_max": 0.11893777782097459, + "advantage_mean": 1.4745940718485784e-09, + "advantage_min": -0.06753726582974195, + "advantage_std": 0.07489508436992764, + "completion_length": 2225.0208435058594, + "epoch": 0.4514285714285714, + "grad_norm": 0.07177528738975525, + "kl": 0.082550048828125, + "learning_rate": 2.1558482853517253e-07, + "loss": 0.007, + "reward": 0.05327743641100824, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.07489508809521794, + "rewards/cosine_scaled_reward": -0.10739827249199152, + "rewards/format_reward": 0.520833333954215, + "step": 395 + }, + { + "advantage_max": 0.17730457615107298, + "advantage_mean": 0.0, + "advantage_min": -0.20803257264196873, + "advantage_std": 0.1546328696422279, + "completion_length": 2330.9166946411133, + "epoch": 0.45257142857142857, + "grad_norm": 0.06790260970592499, + "kl": 0.09099960327148438, + "learning_rate": 2.134908592756607e-07, + "loss": 0.018, + "reward": 0.07746678782859817, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.15463287895545363, + "rewards/cosine_scaled_reward": -0.023853310383856297, + "rewards/format_reward": 0.5000000111758709, + "step": 396 + }, + { + "advantage_max": 0.2035736571997404, + "advantage_mean": -1.5522043442239841e-09, + "advantage_min": -0.13746324321255088, + "advantage_std": 0.13648039917461574, + "completion_length": 2338.104263305664, + "epoch": 0.45371428571428574, + "grad_norm": 0.06633574515581131, + "kl": 0.0894622802734375, + "learning_rate": 2.1141329099692406e-07, + "loss": 0.0108, + "reward": 0.06513352657202631, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13648040336556733, + "rewards/cosine_scaled_reward": -0.0791270025074482, + "rewards/format_reward": 0.5416666716337204, + "step": 397 + }, + { + "advantage_max": 0.19265004387125373, + "advantage_mean": -2.3283065475609988e-09, + "advantage_min": -0.15929417125880718, + "advantage_std": 0.13376712054014206, + 
"completion_length": 2583.8750534057617, + "epoch": 0.45485714285714285, + "grad_norm": 0.06692863255739212, + "kl": 0.09836196899414062, + "learning_rate": 2.0935222495670968e-07, + "loss": 0.0099, + "reward": 0.07285916851833463, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1337671286892146, + "rewards/cosine_scaled_reward": -0.06691717123612761, + "rewards/format_reward": 0.562500013038516, + "step": 398 + }, + { + "advantage_max": 0.1806424199603498, + "advantage_mean": 3.414849639171713e-09, + "advantage_min": -0.16816816572099924, + "advantage_std": 0.154349563177675, + "completion_length": 2301.1458892822266, + "epoch": 0.456, + "grad_norm": 0.1409112811088562, + "kl": 0.06836700439453125, + "learning_rate": 2.0730776160846853e-07, + "loss": 0.0188, + "reward": 0.12668270588619635, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.15434956131502986, + "rewards/cosine_scaled_reward": 0.029449453577399254, + "rewards/format_reward": 0.6875000149011612, + "step": 399 + }, + { + "advantage_max": 0.18242146540433168, + "advantage_mean": 3.570069934122344e-09, + "advantage_min": -0.15212072804570198, + "advantage_std": 0.1341659629251808, + "completion_length": 1446.437515258789, + "epoch": 0.45714285714285713, + "grad_norm": 0.04043472185730934, + "kl": 0.0493316650390625, + "learning_rate": 2.0528000059645995e-07, + "loss": 0.0073, + "reward": 0.16493975650519133, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13416596641764045, + "rewards/cosine_scaled_reward": 0.08686932059936225, + "rewards/format_reward": 0.7916666716337204, + "step": 400 + }, + { + "advantage_max": 0.11342965112999082, + "advantage_mean": -7.140140007022566e-09, + "advantage_min": -0.1604298735037446, + "advantage_std": 0.10300690727308393, + "completion_length": 2901.4791717529297, + "epoch": 0.4582857142857143, + "grad_norm": 0.08971995860338211, + "kl": 0.09316825866699219, + "learning_rate": 2.032690407508949e-07, + "loss": 0.0122, + "reward": 
0.05048931506462395, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1030069119296968, + "rewards/cosine_scaled_reward": -0.05228261277079582, + "rewards/format_reward": 0.39583333767950535, + "step": 401 + }, + { + "advantage_max": 0.12064083246514201, + "advantage_mean": -1.319373665875645e-09, + "advantage_min": -0.10163196362555027, + "advantage_std": 0.09164711134508252, + "completion_length": 2343.270866394043, + "epoch": 0.4594285714285714, + "grad_norm": 0.08057854324579239, + "kl": 0.0913543701171875, + "learning_rate": 2.0127498008311922e-07, + "loss": 0.009, + "reward": 0.06032170820981264, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.09164711693301797, + "rewards/cosine_scaled_reward": -0.08369093760848045, + "rewards/format_reward": 0.5208333376795053, + "step": 402 + }, + { + "advantage_max": 0.08787010563537478, + "advantage_mean": 1.3969838896787934e-09, + "advantage_min": -0.1497042439877987, + "advantage_std": 0.09532873774878681, + "completion_length": 1638.4791793823242, + "epoch": 0.4605714285714286, + "grad_norm": 0.03606126457452774, + "kl": 0.042949676513671875, + "learning_rate": 1.9929791578083655e-07, + "loss": 0.0072, + "reward": 0.11972164455801249, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09532873728312552, + "rewards/cosine_scaled_reward": -0.0013210475444793701, + "rewards/format_reward": 0.7083333432674408, + "step": 403 + }, + { + "advantage_max": 0.12971502542495728, + "advantage_mean": -4.8118334594615675e-09, + "advantage_min": -0.1479872101917863, + "advantage_std": 0.10552329616621137, + "completion_length": 1950.8542175292969, + "epoch": 0.4617142857142857, + "grad_norm": 0.13382993638515472, + "kl": 0.07288742065429688, + "learning_rate": 1.9733794420337213e-07, + "loss": 0.0012, + "reward": 0.1100343014113605, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.10552330128848553, + "rewards/cosine_scaled_reward": -0.013227662071585655, + 
"rewards/format_reward": 0.6666666697710752, + "step": 404 + }, + { + "advantage_max": 0.1491077565588057, + "advantage_mean": -6.364038018791263e-09, + "advantage_min": -0.13719645235687494, + "advantage_std": 0.12250480009242892, + "completion_length": 1873.125015258789, + "epoch": 0.46285714285714286, + "grad_norm": 0.16895225644111633, + "kl": 0.057651519775390625, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.0009, + "reward": 0.17472200049087405, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12250480428338051, + "rewards/cosine_scaled_reward": 0.1878266427665949, + "rewards/format_reward": 0.6458333395421505, + "step": 405 + }, + { + "advantage_max": 0.24610369745641947, + "advantage_mean": -1.474594175931987e-09, + "advantage_min": -0.16500617330893874, + "advantage_std": 0.17077338322997093, + "completion_length": 2001.6875457763672, + "epoch": 0.464, + "grad_norm": 0.037415359169244766, + "kl": 0.05828857421875, + "learning_rate": 1.934696604901642e-07, + "loss": 0.0095, + "reward": 0.13929871143773198, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1707733916118741, + "rewards/cosine_scaled_reward": 0.01136303087696433, + "rewards/format_reward": 0.7916666753590107, + "step": 406 + }, + { + "advantage_max": 0.10421117953956127, + "advantage_mean": -5.432716707054475e-10, + "advantage_min": -0.10994730168022215, + "advantage_std": 0.08500955649651587, + "completion_length": 2364.541702270508, + "epoch": 0.46514285714285714, + "grad_norm": 0.06763742864131927, + "kl": 0.05281829833984375, + "learning_rate": 1.915615368891117e-07, + "loss": 0.0026, + "reward": 0.1408998296046775, + "reward_advantage_correlation": 1.0, + "reward_std": 0.0850095555651933, + "rewards/cosine_scaled_reward": 0.10000266879796982, + "rewards/format_reward": 0.6250000055879354, + "step": 407 + }, + { + "advantage_max": 0.13665086403489113, + "advantage_mean": -6.364037768991082e-09, + "advantage_min": -0.12718658475205302, + 
"advantage_std": 0.10699888691306114, + "completion_length": 1904.4583587646484, + "epoch": 0.4662857142857143, + "grad_norm": 0.04600901156663895, + "kl": 0.04422760009765625, + "learning_rate": 1.8967088307307e-07, + "loss": 0.0046, + "reward": 0.15510824089869857, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10699889156967402, + "rewards/cosine_scaled_reward": 0.12226809374988079, + "rewards/format_reward": 0.666666679084301, + "step": 408 + }, + { + "advantage_max": 0.21366258803755045, + "advantage_mean": -1.5522042678961512e-09, + "advantage_min": -0.1656627543270588, + "advantage_std": 0.1524591613560915, + "completion_length": 3079.3750915527344, + "epoch": 0.4674285714285714, + "grad_norm": 0.1300925612449646, + "kl": 0.0786285400390625, + "learning_rate": 1.8779779118983867e-07, + "loss": 0.0225, + "reward": 0.049404874444007874, + "reward_advantage_correlation": 1.0, + "reward_std": 0.15245915856212378, + "rewards/cosine_scaled_reward": -0.031727675814181566, + "rewards/format_reward": 0.354166679084301, + "step": 409 + }, + { + "advantage_max": 0.17127191461622715, + "advantage_mean": -4.811833501094931e-09, + "advantage_min": -0.1715862648561597, + "advantage_std": 0.1427832981571555, + "completion_length": 2284.395835876465, + "epoch": 0.4685714285714286, + "grad_norm": 0.05336384102702141, + "kl": 0.051250457763671875, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.0085, + "reward": 0.11855864059180021, + "reward_advantage_correlation": 1.0, + "reward_std": 0.14278329606167972, + "rewards/cosine_scaled_reward": 0.03388424590229988, + "rewards/format_reward": 0.6250000093132257, + "step": 410 + }, + { + "advantage_max": 0.15930527402088046, + "advantage_mean": 3.1820188706177532e-09, + "advantage_min": -0.14635033579543233, + "advantage_std": 0.12614521011710167, + "completion_length": 2849.9166870117188, + "epoch": 0.4697142857142857, + "grad_norm": 0.060255929827690125, + "kl": 0.06512451171875, + "learning_rate": 
1.8410465752883758e-07, + "loss": 0.007, + "reward": 0.06567088048905134, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12614521756768227, + "rewards/cosine_scaled_reward": -0.026130111888051033, + "rewards/format_reward": 0.43750000558793545, + "step": 411 + }, + { + "advantage_max": 0.134061299264431, + "advantage_mean": -4.5013923699599445e-09, + "advantage_min": -0.1902531199157238, + "advantage_std": 0.13698356272652745, + "completion_length": 2615.437545776367, + "epoch": 0.47085714285714286, + "grad_norm": 0.13102658092975616, + "kl": 0.066558837890625, + "learning_rate": 1.822847957491922e-07, + "loss": 0.0256, + "reward": 0.08477710420265794, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13698356179520488, + "rewards/cosine_scaled_reward": 0.008670665323734283, + "rewards/format_reward": 0.47916667722165585, + "step": 412 + }, + { + "advantage_max": 0.11806221166625619, + "advantage_mean": -4.462587521736339e-09, + "advantage_min": -0.1252172514796257, + "advantage_std": 0.10003542294725776, + "completion_length": 1962.1875534057617, + "epoch": 0.472, + "grad_norm": 0.04213166981935501, + "kl": 0.04528236389160156, + "learning_rate": 1.804828558898332e-07, + "loss": 0.0115, + "reward": 0.10885413386859, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10003542667254806, + "rewards/cosine_scaled_reward": -0.013450214173644781, + "rewards/format_reward": 0.6666666734963655, + "step": 413 + }, + { + "advantage_max": 0.126890292391181, + "advantage_mean": 9.701277836615674e-10, + "advantage_min": -0.10505080316215754, + "advantage_std": 0.0860840454697609, + "completion_length": 2914.354217529297, + "epoch": 0.47314285714285714, + "grad_norm": 0.06571124494075775, + "kl": 0.061893463134765625, + "learning_rate": 1.7869892577476722e-07, + "loss": 0.0088, + "reward": -0.004312141099944711, + "reward_advantage_correlation": 1.0, + "reward_std": 0.08608405268751085, + "rewards/cosine_scaled_reward": 
-0.19047635607421398, + "rewards/format_reward": 0.35416667349636555, + "step": 414 + }, + { + "advantage_max": 0.23087134351953864, + "advantage_mean": -1.0089327637241574e-09, + "advantage_min": -0.14913060469552875, + "advantage_std": 0.14599163830280304, + "completion_length": 2982.791732788086, + "epoch": 0.4742857142857143, + "grad_norm": 0.12604767084121704, + "kl": 0.083526611328125, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.0284, + "reward": 0.006746219587512314, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14599164854735136, + "rewards/cosine_scaled_reward": -0.10747438576072454, + "rewards/format_reward": 0.2500000074505806, + "step": 415 + }, + { + "advantage_max": 0.17671362031251192, + "advantage_mean": -5.122274285285222e-09, + "advantage_min": -0.16972810495644808, + "advantage_std": 0.13787524495273829, + "completion_length": 2139.8541831970215, + "epoch": 0.4754285714285714, + "grad_norm": 0.1460166573524475, + "kl": 0.05098724365234375, + "learning_rate": 1.7518544168045524e-07, + "loss": 0.0163, + "reward": 0.08973793289624155, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13787524681538343, + "rewards/cosine_scaled_reward": -0.07056776992976665, + "rewards/format_reward": 0.6666666753590107, + "step": 416 + }, + { + "advantage_max": 0.18380063539370894, + "advantage_mean": -2.638747387262441e-09, + "advantage_min": -0.12609656807035208, + "advantage_std": 0.11879561748355627, + "completion_length": 3139.375030517578, + "epoch": 0.4765714285714286, + "grad_norm": 0.11847390979528427, + "kl": 0.088165283203125, + "learning_rate": 1.7345605894346726e-07, + "loss": 0.0137, + "reward": -0.02197717159288004, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11879561934620142, + "rewards/cosine_scaled_reward": -0.2001864407211542, + "rewards/format_reward": 0.27083334140479565, + "step": 417 + }, + { + "advantage_max": 0.15402009896934032, + "advantage_mean": 1.5522044760629683e-10, + 
"advantage_min": -0.16916337795555592, + "advantage_std": 0.12051357375457883, + "completion_length": 2231.395854949951, + "epoch": 0.4777142857142857, + "grad_norm": 0.04836947098374367, + "kl": 0.0673980712890625, + "learning_rate": 1.7174502842694212e-07, + "loss": 0.014, + "reward": 0.09378412109799683, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12051357794553041, + "rewards/cosine_scaled_reward": -0.01670103892683983, + "rewards/format_reward": 0.5833333414047956, + "step": 418 + }, + { + "advantage_max": 0.1730736023746431, + "advantage_mean": -2.5223320619660594e-09, + "advantage_min": -0.16914140712469816, + "advantage_std": 0.13918489590287209, + "completion_length": 2572.7708587646484, + "epoch": 0.47885714285714287, + "grad_norm": 0.07898696511983871, + "kl": 0.07297134399414062, + "learning_rate": 1.7005243352409333e-07, + "loss": 0.0067, + "reward": 0.08029933553189039, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1391849028877914, + "rewards/cosine_scaled_reward": -0.023447027429938316, + "rewards/format_reward": 0.5208333432674408, + "step": 419 + }, + { + "advantage_max": 0.1278257886879146, + "advantage_mean": 1.3969840145788837e-09, + "advantage_min": -0.16028660000301898, + "advantage_std": 0.11588235246017575, + "completion_length": 1729.8750228881836, + "epoch": 0.48, + "grad_norm": 0.045995082706213, + "kl": 0.061191558837890625, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.0122, + "reward": 0.09284290811046958, + "reward_advantage_correlation": 0.9999999999999997, + "reward_std": 0.11588235734961927, + "rewards/cosine_scaled_reward": -0.09275021031498909, + "rewards/format_reward": 0.7291666753590107, + "step": 420 + }, + { + "advantage_max": 0.17663031490519643, + "advantage_mean": 2.48352696741172e-09, + "advantage_min": -0.12583108618855476, + "advantage_std": 0.11714980588294566, + "completion_length": 2878.0625534057617, + "epoch": 0.48114285714285715, + "grad_norm": 0.04500269517302513, + "kl": 
0.08325958251953125, + "learning_rate": 1.6672287963562852e-07, + "loss": 0.0154, + "reward": 0.0023640617728233337, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1171498135663569, + "rewards/cosine_scaled_reward": -0.15943891881033778, + "rewards/format_reward": 0.3333333358168602, + "step": 421 + }, + { + "advantage_max": 0.15642781276255846, + "advantage_mean": -2.3283066447055134e-10, + "advantage_min": -0.16361185582354665, + "advantage_std": 0.12902699504047632, + "completion_length": 2730.6458892822266, + "epoch": 0.48228571428571426, + "grad_norm": 0.14173462986946106, + "kl": 0.0648040771484375, + "learning_rate": 1.6508608292777203e-07, + "loss": 0.0212, + "reward": 0.03911724709905684, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1290269959717989, + "rewards/cosine_scaled_reward": -0.10371733736246824, + "rewards/format_reward": 0.43750000931322575, + "step": 422 + }, + { + "advantage_max": 0.1365548074245453, + "advantage_mean": 1.1641532390860299e-09, + "advantage_min": -0.16458274144679308, + "advantage_std": 0.12328715343028307, + "completion_length": 2499.8958740234375, + "epoch": 0.48342857142857143, + "grad_norm": 0.06037713214755058, + "kl": 0.0506591796875, + "learning_rate": 1.6346804638120098e-07, + "loss": 0.0137, + "reward": 0.06676678382791579, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12328716181218624, + "rewards/cosine_scaled_reward": -0.05246353894472122, + "rewards/format_reward": 0.5000000074505806, + "step": 423 + }, + { + "advantage_max": 0.22071715723723173, + "advantage_mean": 2.3283065059276353e-10, + "advantage_min": -0.12588559091091156, + "advantage_std": 0.1304079587571323, + "completion_length": 2821.6250762939453, + "epoch": 0.4845714285714286, + "grad_norm": 0.08061812072992325, + "kl": 0.068572998046875, + "learning_rate": 1.6186884885673413e-07, + "loss": 0.0176, + "reward": 0.003499031998217106, + "reward_advantage_correlation": 1.0, + "reward_std": 
0.13040796015411615, + "rewards/cosine_scaled_reward": -0.16878705425187945, + "rewards/format_reward": 0.3541666716337204, + "step": 424 + }, + { + "advantage_max": 0.15021021151915193, + "advantage_mean": 2.7939678071131624e-09, + "advantage_min": -0.20629673171788454, + "advantage_std": 0.14314838591963053, + "completion_length": 1907.8125457763672, + "epoch": 0.4857142857142857, + "grad_norm": 0.09490145742893219, + "kl": 0.0328369140625, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.0125, + "reward": 0.26927735190838575, + "reward_advantage_correlation": 1.0, + "reward_std": 0.143148398026824, + "rewards/cosine_scaled_reward": 0.3964337124489248, + "rewards/format_reward": 0.7916666772216558, + "step": 425 + }, + { + "advantage_max": 0.17377169243991375, + "advantage_mean": -1.3969839035565812e-09, + "advantage_min": -0.13985866121947765, + "advantage_std": 0.12318799132481217, + "completion_length": 2227.520881652832, + "epoch": 0.4868571428571429, + "grad_norm": 0.06765562295913696, + "kl": 0.0846099853515625, + "learning_rate": 1.5872728172265146e-07, + "loss": 0.0153, + "reward": 0.06637881277129054, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1231879978440702, + "rewards/cosine_scaled_reward": -0.11847709584981203, + "rewards/format_reward": 0.625000013038516, + "step": 426 + }, + { + "advantage_max": 0.2253029616549611, + "advantage_mean": -3.104408619059029e-09, + "advantage_min": -0.17749943817034364, + "advantage_std": 0.15275797015056014, + "completion_length": 2793.416748046875, + "epoch": 0.488, + "grad_norm": 0.12349321693181992, + "kl": 0.087188720703125, + "learning_rate": 1.5718506522858572e-07, + "loss": 0.0226, + "reward": 0.11759098537731916, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1527579859830439, + "rewards/cosine_scaled_reward": 0.10679585766047239, + "rewards/format_reward": 0.4791666753590107, + "step": 427 + }, + { + "advantage_max": 0.15732468385249376, + "advantage_mean": 
-4.268561941411786e-09, + "advantage_min": -0.1455386085435748, + "advantage_std": 0.12252023816108704, + "completion_length": 2145.104232788086, + "epoch": 0.48914285714285716, + "grad_norm": 0.0802699625492096, + "kl": 0.0648345947265625, + "learning_rate": 1.5566199398026147e-07, + "loss": 0.0137, + "reward": 0.09161906410008669, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1225202432833612, + "rewards/cosine_scaled_reward": -0.07740333583205938, + "rewards/format_reward": 0.687500013038516, + "step": 428 + }, + { + "advantage_max": 0.19291006959974766, + "advantage_mean": -1.3969839451899446e-09, + "advantage_min": -0.14975288417190313, + "advantage_std": 0.13649496110156178, + "completion_length": 2090.354202270508, + "epoch": 0.49028571428571427, + "grad_norm": 0.04922656714916229, + "kl": 0.06599807739257812, + "learning_rate": 1.5415814221002265e-07, + "loss": 0.0087, + "reward": 0.09877313417382538, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.13649496855214238, + "rewards/cosine_scaled_reward": -0.10497084120288491, + "rewards/format_reward": 0.7916666734963655, + "step": 429 + }, + { + "advantage_max": 0.13820017455145717, + "advantage_mean": -4.152146633462639e-09, + "advantage_min": -0.10727433580905199, + "advantage_std": 0.0930909130256623, + "completion_length": 2264.000045776367, + "epoch": 0.49142857142857144, + "grad_norm": 0.15907180309295654, + "kl": 0.07495880126953125, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0011, + "reward": 0.12007943191565573, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09309091651812196, + "rewards/cosine_scaled_reward": 0.03994573000818491, + "rewards/format_reward": 0.6250000037252903, + "step": 430 + }, + { + "advantage_max": 0.10399939585477114, + "advantage_mean": -1.1641531211248335e-09, + "advantage_min": -0.13584416639059782, + "advantage_std": 0.09472417016513646, + "completion_length": 2309.6250534057617, + "epoch": 0.49257142857142855, + 
"grad_norm": 0.06702486425638199, + "kl": 0.07297134399414062, + "learning_rate": 1.5120838934595337e-07, + "loss": 0.012, + "reward": 0.04253681842237711, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09472417179495096, + "rewards/cosine_scaled_reward": -0.1354589331895113, + "rewards/format_reward": 0.5208333395421505, + "step": 431 + }, + { + "advantage_max": 0.1814054111018777, + "advantage_mean": -3.492460209919557e-10, + "advantage_min": -0.16293080430477858, + "advantage_std": 0.13140337774530053, + "completion_length": 2683.3333740234375, + "epoch": 0.4937142857142857, + "grad_norm": 0.12716400623321533, + "kl": 0.09224700927734375, + "learning_rate": 1.4976263201891613e-07, + "loss": 0.0095, + "reward": 0.07091285544447601, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13140338193625212, + "rewards/cosine_scaled_reward": -0.011083395686000586, + "rewards/format_reward": 0.43750000558793545, + "step": 432 + }, + { + "advantage_max": 0.12933768006041646, + "advantage_mean": 1.3193737630201596e-09, + "advantage_min": -0.15304936189204454, + "advantage_std": 0.1121934037655592, + "completion_length": 2891.5416870117188, + "epoch": 0.4948571428571429, + "grad_norm": 0.05526670441031456, + "kl": 0.08255767822265625, + "learning_rate": 1.483363816965435e-07, + "loss": 0.013, + "reward": 0.0877141747623682, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11219340935349464, + "rewards/cosine_scaled_reward": 0.06979287602007389, + "rewards/format_reward": 0.37500000931322575, + "step": 433 + }, + { + "advantage_max": 0.09472581138834357, + "advantage_mean": -9.701277038642875e-10, + "advantage_min": -0.09733299724757671, + "advantage_std": 0.07522567734122276, + "completion_length": 2730.625030517578, + "epoch": 0.496, + "grad_norm": 0.061184678226709366, + "kl": 0.10284423828125, + "learning_rate": 1.469297078922642e-07, + "loss": 0.0124, + "reward": -0.020490076043643057, + "reward_advantage_correlation": 1.0, + 
"reward_std": 0.07522568292915821, + "rewards/cosine_scaled_reward": -0.2582273744046688, + "rewards/format_reward": 0.3958333395421505, + "step": 434 + }, + { + "advantage_max": 0.14937169384211302, + "advantage_mean": 2.3283065059276353e-10, + "advantage_min": -0.11381850577890873, + "advantage_std": 0.10129550378769636, + "completion_length": 2136.229202270508, + "epoch": 0.49714285714285716, + "grad_norm": 0.08722381293773651, + "kl": 0.07655715942382812, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.012, + "reward": 0.02877517172601074, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.10129550378769636, + "rewards/cosine_scaled_reward": -0.1954280026257038, + "rewards/format_reward": 0.5625000037252903, + "step": 435 + }, + { + "advantage_max": 0.10816556308418512, + "advantage_mean": -5.665545969868457e-09, + "advantage_min": -0.11551058106124401, + "advantage_std": 0.09109327476471663, + "completion_length": 1993.3333587646484, + "epoch": 0.4982857142857143, + "grad_norm": 0.040935587137937546, + "kl": 0.03719520568847656, + "learning_rate": 1.4417536311769885e-07, + "loss": 0.0024, + "reward": 0.1375275724567473, + "reward_advantage_correlation": 1.0, + "reward_std": 0.0910932756960392, + "rewards/cosine_scaled_reward": 0.12174435332417488, + "rewards/format_reward": 0.5625000018626451, + "step": 436 + }, + { + "advantage_max": 0.15487646870315075, + "advantage_mean": 4.163336342344337e-17, + "advantage_min": -0.1355657959356904, + "advantage_std": 0.12161733116954565, + "completion_length": 2353.0834197998047, + "epoch": 0.49942857142857144, + "grad_norm": 0.15399853885173798, + "kl": 0.08425140380859375, + "learning_rate": 1.4282782639029128e-07, + "loss": 0.0238, + "reward": 0.03445580159313977, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12161733210086823, + "rewards/cosine_scaled_reward": -0.15955427661538124, + "rewards/format_reward": 0.5208333376795053, + "step": 437 + }, + { + 
"advantage_max": 0.17049633665010333, + "advantage_mean": -1.9402554216063628e-09, + "advantage_min": -0.15271185897290707, + "advantage_std": 0.13004512721090578, + "completion_length": 2923.1875534057617, + "epoch": 0.5005714285714286, + "grad_norm": 0.1422165185213089, + "kl": 0.054004669189453125, + "learning_rate": 1.4150013466019114e-07, + "loss": 0.0172, + "reward": 0.04482788871973753, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13004513212945312, + "rewards/cosine_scaled_reward": -0.08538412535563111, + "rewards/format_reward": 0.4375000111758709, + "step": 438 + }, + { + "advantage_max": 0.12892017513513565, + "advantage_mean": -1.9402553730341054e-09, + "advantage_min": -0.102175647392869, + "advantage_std": 0.09014736721292138, + "completion_length": 2296.0000228881836, + "epoch": 0.5017142857142857, + "grad_norm": 0.035092201083898544, + "kl": 0.061981201171875, + "learning_rate": 1.4019235263722034e-07, + "loss": 0.0096, + "reward": 0.03524679830297828, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09014736441895366, + "rewards/cosine_scaled_reward": -0.08231546822935343, + "rewards/format_reward": 0.37500000186264515, + "step": 439 + }, + { + "advantage_max": 0.13992469711229205, + "advantage_mean": -2.1730860583213385e-09, + "advantage_min": -0.1054043099284172, + "advantage_std": 0.09207697911188006, + "completion_length": 2813.041717529297, + "epoch": 0.5028571428571429, + "grad_norm": 0.14594881236553192, + "kl": 0.09942626953125, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.0099, + "reward": -0.01191971474327147, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.09207698097452521, + "rewards/cosine_scaled_reward": -0.21304061822593212, + "rewards/format_reward": 0.3541666716337204, + "step": 440 + }, + { + "advantage_max": 0.12328352779150009, + "advantage_mean": -3.4148496946828644e-09, + "advantage_min": -0.17189356870949268, + "advantage_std": 0.11699239769950509, + 
"completion_length": 2687.5208740234375, + "epoch": 0.504, + "grad_norm": 0.11024197936058044, + "kl": 0.07801437377929688, + "learning_rate": 1.3763677169699217e-07, + "loss": 0.0047, + "reward": 0.09399675484746695, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11699240235611796, + "rewards/cosine_scaled_reward": 0.03823857568204403, + "rewards/format_reward": 0.4791666753590107, + "step": 441 + }, + { + "advantage_max": 0.1927649211138487, + "advantage_mean": -7.761021547647573e-10, + "advantage_min": -0.14677509432658553, + "advantage_std": 0.1355924210511148, + "completion_length": 2098.208366394043, + "epoch": 0.5051428571428571, + "grad_norm": 0.14278782904148102, + "kl": 0.06755447387695312, + "learning_rate": 1.3638909733514452e-07, + "loss": 0.024, + "reward": 0.04010949970688671, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1355924243107438, + "rewards/cosine_scaled_reward": -0.18566999700851738, + "rewards/format_reward": 0.604166679084301, + "step": 442 + }, + { + "advantage_max": 0.18520016921684146, + "advantage_mean": -1.7850350295112172e-09, + "advantage_min": -0.21217109076678753, + "advantage_std": 0.16153573151677847, + "completion_length": 2472.000045776367, + "epoch": 0.5062857142857143, + "grad_norm": 0.03744587302207947, + "kl": 0.033538818359375, + "learning_rate": 1.351615817851748e-07, + "loss": 0.0077, + "reward": 0.09077572450041771, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.16153573850169778, + "rewards/cosine_scaled_reward": -0.012071860954165459, + "rewards/format_reward": 0.5625000074505806, + "step": 443 + }, + { + "advantage_max": 0.16735272575169802, + "advantage_mean": -1.2417634004391331e-09, + "advantage_min": -0.10703678708523512, + "advantage_std": 0.10983354318886995, + "completion_length": 2711.916702270508, + "epoch": 0.5074285714285715, + "grad_norm": 0.1395997852087021, + "kl": 0.083709716796875, + "learning_rate": 1.3395428487445914e-07, 
+ "loss": 0.0051, + "reward": 0.0046759541146457195, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.10983354831114411, + "rewards/cosine_scaled_reward": -0.1657371548935771, + "rewards/format_reward": 0.35416666977107525, + "step": 444 + }, + { + "advantage_max": 0.14539906289428473, + "advantage_mean": -4.423782298812462e-09, + "advantage_min": -0.1462356518022716, + "advantage_std": 0.11147704510949552, + "completion_length": 2879.6250534057617, + "epoch": 0.5085714285714286, + "grad_norm": 0.10607258975505829, + "kl": 0.10732269287109375, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0139, + "reward": 0.038264825008809566, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11147704510949552, + "rewards/cosine_scaled_reward": -0.10536502301692963, + "rewards/format_reward": 0.4375000111758709, + "step": 445 + }, + { + "advantage_max": 0.18112129345536232, + "advantage_mean": -1.6298145749660264e-09, + "advantage_min": -0.1495385468006134, + "advantage_std": 0.12615549704059958, + "completion_length": 2642.5208892822266, + "epoch": 0.5097142857142857, + "grad_norm": 0.18339461088180542, + "kl": 0.07524871826171875, + "learning_rate": 1.316005813502869e-07, + "loss": 0.0173, + "reward": 0.041458213003352284, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.12615550868213177, + "rewards/cosine_scaled_reward": -0.13958797266241163, + "rewards/format_reward": 0.5208333376795053, + "step": 446 + }, + { + "advantage_max": 0.20427228696644306, + "advantage_mean": 3.1044089521259366e-10, + "advantage_min": -0.1569793475791812, + "advantage_std": 0.14149027224630117, + "completion_length": 2453.458427429199, + "epoch": 0.5108571428571429, + "grad_norm": 0.05572226271033287, + "kl": 0.0703887939453125, + "learning_rate": 1.3045428945301953e-07, + "loss": 0.0117, + "reward": 0.09940684377215803, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.14149027597159147, + 
"rewards/cosine_scaled_reward": -0.06487349630333483, + "rewards/format_reward": 0.7083333376795053, + "step": 447 + }, + { + "advantage_max": 0.14345520036295056, + "advantage_mean": -4.656613053488634e-09, + "advantage_min": -0.14839566173031926, + "advantage_std": 0.11676807375624776, + "completion_length": 2164.645881652832, + "epoch": 0.512, + "grad_norm": 0.11576156318187714, + "kl": 0.07922554016113281, + "learning_rate": 1.2932844562179352e-07, + "loss": 0.0173, + "reward": 0.11762388469651341, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.1167680760845542, + "rewards/cosine_scaled_reward": 0.03612196817994118, + "rewards/format_reward": 0.6041666753590107, + "step": 448 + }, + { + "advantage_max": 0.12314849765971303, + "advantage_mean": -2.8715780586718864e-09, + "advantage_min": -0.09987784596160054, + "advantage_std": 0.09292110335081816, + "completion_length": 1891.7500534057617, + "epoch": 0.5131428571428571, + "grad_norm": 0.036677490919828415, + "kl": 0.051727294921875, + "learning_rate": 1.2822310472864885e-07, + "loss": 0.0053, + "reward": 0.05903090629726648, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.09292111033573747, + "rewards/cosine_scaled_reward": -0.1294665727764368, + "rewards/format_reward": 0.6041666753590107, + "step": 449 + }, + { + "advantage_max": 0.12233589449897408, + "advantage_mean": -2.483526884144993e-09, + "advantage_min": -0.12476734491065145, + "advantage_std": 0.09501770418137312, + "completion_length": 2092.4791946411133, + "epoch": 0.5142857142857142, + "grad_norm": 0.08849738538265228, + "kl": 0.0531463623046875, + "learning_rate": 1.2713832064634125e-07, + "loss": 0.0012, + "reward": 0.07593667833134532, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09501770790666342, + "rewards/cosine_scaled_reward": -0.08989906311035156, + "rewards/format_reward": 0.6250000055879354, + "step": 450 + }, + { + "advantage_max": 0.17539388965815306, + "advantage_mean": 
-6.519258161086228e-09, + "advantage_min": -0.13914753962308168, + "advantage_std": 0.1160832904279232, + "completion_length": 2289.95841217041, + "epoch": 0.5154285714285715, + "grad_norm": 0.08232926577329636, + "kl": 0.0707550048828125, + "learning_rate": 1.260741462457165e-07, + "loss": 0.0146, + "reward": 0.1173245128011331, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11608328856527805, + "rewards/cosine_scaled_reward": 0.010675758589059114, + "rewards/format_reward": 0.6666666828095913, + "step": 451 + }, + { + "advantage_max": 0.2101496052928269, + "advantage_mean": -4.190951877203197e-09, + "advantage_min": -0.11972010880708694, + "advantage_std": 0.12568515678867698, + "completion_length": 2983.8750610351562, + "epoch": 0.5165714285714286, + "grad_norm": 0.05641166865825653, + "kl": 0.0640869140625, + "learning_rate": 1.2503063339313356e-07, + "loss": 0.0053, + "reward": 0.09685743995942175, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12568516144528985, + "rewards/cosine_scaled_reward": 0.06668644212186337, + "rewards/format_reward": 0.43750000558793545, + "step": 452 + }, + { + "advantage_max": 0.205594627186656, + "advantage_mean": -3.182018815106602e-09, + "advantage_min": -0.14165670238435268, + "advantage_std": 0.14093559887260199, + "completion_length": 2465.9792556762695, + "epoch": 0.5177142857142857, + "grad_norm": 0.06710201501846313, + "kl": 0.05599212646484375, + "learning_rate": 1.2400783294793668e-07, + "loss": 0.0055, + "reward": 0.06225473037920892, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.14093559375032783, + "rewards/cosine_scaled_reward": -0.10087771527469158, + "rewards/format_reward": 0.5625000018626451, + "step": 453 + }, + { + "advantage_max": 0.16545915231108665, + "advantage_mean": 3.2596291082986895e-09, + "advantage_min": -0.1489626714028418, + "advantage_std": 0.13455732073634863, + "completion_length": 2598.0833892822266, + "epoch": 0.5188571428571429, + 
"grad_norm": 0.041934214532375336, + "kl": 0.04929351806640625, + "learning_rate": 1.2300579475997657e-07, + "loss": 0.0007, + "reward": 0.05813994584605098, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.13455732772126794, + "rewards/cosine_scaled_reward": -0.13080565445125103, + "rewards/format_reward": 0.6041666716337204, + "step": 454 + }, + { + "advantage_max": 0.14655470103025436, + "advantage_mean": 1.1641532182693481e-10, + "advantage_min": -0.12023748084902763, + "advantage_std": 0.1034624595195055, + "completion_length": 2623.9375610351562, + "epoch": 0.52, + "grad_norm": 0.09280390292406082, + "kl": 0.0705718994140625, + "learning_rate": 1.220245676671809e-07, + "loss": 0.0236, + "reward": -0.008920757623855025, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.10346246417611837, + "rewards/cosine_scaled_reward": -0.22455782443284988, + "rewards/format_reward": 0.3958333432674408, + "step": 455 + }, + { + "advantage_max": 0.1612388575449586, + "advantage_mean": 3.8805108432127255e-11, + "advantage_min": -0.08964319387450814, + "advantage_std": 0.09334565093740821, + "completion_length": 2911.5208740234375, + "epoch": 0.5211428571428571, + "grad_norm": 0.045997679233551025, + "kl": 0.07303619384765625, + "learning_rate": 1.2106419949317388e-07, + "loss": 0.0133, + "reward": -0.03551657311618328, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.09334565559402108, + "rewards/cosine_scaled_reward": -0.24067565985023975, + "rewards/format_reward": 0.2708333432674408, + "step": 456 + }, + { + "advantage_max": 0.13807542622089386, + "advantage_mean": -1.940255463239726e-09, + "advantage_min": -0.12592454068362713, + "advantage_std": 0.10808086302131414, + "completion_length": 2840.4375610351562, + "epoch": 0.5222857142857142, + "grad_norm": 0.06265939027070999, + "kl": 0.0859375, + "learning_rate": 1.2012473704494537e-07, + "loss": 0.0117, + "reward": 0.005134745966643095, + 
"reward_advantage_correlation": 1.0, + "reward_std": 0.1080808648839593, + "rewards/cosine_scaled_reward": -0.1515765618532896, + "rewards/format_reward": 0.3333333358168602, + "step": 457 + }, + { + "advantage_max": 0.18150201439857483, + "advantage_mean": 2.793967751602011e-09, + "advantage_min": -0.13712891470640898, + "advantage_std": 0.13010421255603433, + "completion_length": 2471.312515258789, + "epoch": 0.5234285714285715, + "grad_norm": 0.07266143709421158, + "kl": 0.0667877197265625, + "learning_rate": 1.1920622611056974e-07, + "loss": 0.0112, + "reward": 0.03780588391236961, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13010421255603433, + "rewards/cosine_scaled_reward": -0.13179424591362476, + "rewards/format_reward": 0.4791666753590107, + "step": 458 + }, + { + "advantage_max": 0.2504174951463938, + "advantage_mean": -7.450580846724009e-09, + "advantage_min": -0.23487072996795177, + "advantage_std": 0.19904521945863962, + "completion_length": 2292.5833740234375, + "epoch": 0.5245714285714286, + "grad_norm": 0.07614655047655106, + "kl": 0.0789947509765625, + "learning_rate": 1.1830871145697412e-07, + "loss": 0.0051, + "reward": 0.14349280833266675, + "reward_advantage_correlation": 1.0, + "reward_std": 0.19904522132128477, + "rewards/cosine_scaled_reward": 0.10745827108621597, + "rewards/format_reward": 0.6250000037252903, + "step": 459 + }, + { + "advantage_max": 0.1652562553063035, + "advantage_mean": -8.149072666663315e-10, + "advantage_min": -0.15781007520854473, + "advantage_std": 0.13692284328863025, + "completion_length": 2972.916748046875, + "epoch": 0.5257142857142857, + "grad_norm": 0.256010502576828, + "kl": 0.0777587890625, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.0243, + "reward": 0.012114565295632929, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13692284747958183, + "rewards/cosine_scaled_reward": -0.16331932321190834, + "rewards/format_reward": 0.3958333432674408, + "step": 460 + }, + { 
+ "advantage_max": 0.1954550128430128, + "advantage_mean": -3.8805108432127255e-11, + "advantage_min": -0.16478789877146482, + "advantage_std": 0.1376098650507629, + "completion_length": 2490.5833587646484, + "epoch": 0.5268571428571428, + "grad_norm": 0.04703768342733383, + "kl": 0.053020477294921875, + "learning_rate": 1.1657684494105386e-07, + "loss": 0.0128, + "reward": 0.04842737386934459, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13760986551642418, + "rewards/cosine_scaled_reward": -0.10884158127009869, + "rewards/format_reward": 0.5000000074505806, + "step": 461 + }, + { + "advantage_max": 0.10769666731357574, + "advantage_mean": -9.895302181817112e-10, + "advantage_min": -0.09830914624035358, + "advantage_std": 0.07689572288654745, + "completion_length": 2611.68754196167, + "epoch": 0.528, + "grad_norm": 0.06967299431562424, + "kl": 0.0802764892578125, + "learning_rate": 1.1574257748745986e-07, + "loss": 0.014, + "reward": -0.02583047526422888, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.07689572405070066, + "rewards/cosine_scaled_reward": -0.2742087193764746, + "rewards/format_reward": 0.39583334513008595, + "step": 462 + }, + { + "advantage_max": 0.11235859198495746, + "advantage_mean": -7.761020853758183e-10, + "advantage_min": -0.10793718742206693, + "advantage_std": 0.08298033010214567, + "completion_length": 2999.4583587646484, + "epoch": 0.5291428571428571, + "grad_norm": 0.0980202853679657, + "kl": 0.07388687133789062, + "learning_rate": 1.1492947512799328e-07, + "loss": 0.008, + "reward": 0.028239358702194295, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.08298033056780696, + "rewards/cosine_scaled_reward": -0.061539738439023495, + "rewards/format_reward": 0.29166666977107525, + "step": 463 + }, + { + "advantage_max": 0.07747854851186275, + "advantage_mean": -2.173086016687975e-09, + "advantage_min": -0.10103305988013744, + "advantage_std": 0.07666776818223298, + 
"completion_length": 2035.020851135254, + "epoch": 0.5302857142857142, + "grad_norm": 0.11843991279602051, + "kl": 0.06780242919921875, + "learning_rate": 1.1413757749211602e-07, + "loss": 0.0076, + "reward": 0.12095656874589622, + "reward_advantage_correlation": 1.0, + "reward_std": 0.0766677693463862, + "rewards/cosine_scaled_reward": 0.06199156865477562, + "rewards/format_reward": 0.5833333432674408, + "step": 464 + }, + { + "advantage_max": 0.19982548616826534, + "advantage_mean": -2.2506963168189564e-09, + "advantage_min": -0.16617505624890327, + "advantage_std": 0.1316032218746841, + "completion_length": 2798.500030517578, + "epoch": 0.5314285714285715, + "grad_norm": 0.10557418316602707, + "kl": 0.0775604248046875, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.0201, + "reward": 0.0002755961613729596, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1316032288596034, + "rewards/cosine_scaled_reward": -0.19768394669517875, + "rewards/format_reward": 0.39583334885537624, + "step": 465 + }, + { + "advantage_max": 0.14168964140117168, + "advantage_mean": 3.6864852975826423e-09, + "advantage_min": -0.13088442478328943, + "advantage_std": 0.12304930435493588, + "completion_length": 2720.0833740234375, + "epoch": 0.5325714285714286, + "grad_norm": 0.06551773101091385, + "kl": 0.08535003662109375, + "learning_rate": 1.1261754973965422e-07, + "loss": 0.0194, + "reward": 0.05027168616652489, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12304930854588747, + "rewards/cosine_scaled_reward": -0.018773799762129784, + "rewards/format_reward": 0.3333333395421505, + "step": 466 + }, + { + "advantage_max": 0.14841501927003264, + "advantage_mean": -4.346172137459359e-09, + "advantage_min": -0.13719767704606056, + "advantage_std": 0.10813061986118555, + "completion_length": 2950.7708740234375, + "epoch": 0.5337142857142857, + "grad_norm": 0.0421525277197361, + "kl": 0.078033447265625, + "learning_rate": 
1.1188949370707787e-07, + "loss": 0.0125, + "reward": 0.018481011386029422, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.10813062265515327, + "rewards/cosine_scaled_reward": -0.12253948114812374, + "rewards/format_reward": 0.354166679084301, + "step": 467 + }, + { + "advantage_max": 0.14324644766747952, + "advantage_mean": -2.0372682273811504e-10, + "advantage_min": -0.0815016619162634, + "advantage_std": 0.08769956149626523, + "completion_length": 2793.041679382324, + "epoch": 0.5348571428571428, + "grad_norm": 0.06327426433563232, + "kl": 0.0679473876953125, + "learning_rate": 1.1118279056249653e-07, + "loss": 0.0069, + "reward": -0.028568633482791483, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.08769956947071478, + "rewards/cosine_scaled_reward": -0.26157646207138896, + "rewards/format_reward": 0.3541666679084301, + "step": 468 + }, + { + "advantage_max": 0.10998845845460892, + "advantage_mean": 3.0656035349130306e-09, + "advantage_min": -0.09244827646762133, + "advantage_std": 0.07723978580906987, + "completion_length": 2816.6666946411133, + "epoch": 0.536, + "grad_norm": 0.07730638980865479, + "kl": 0.0949859619140625, + "learning_rate": 1.1049747474962444e-07, + "loss": 0.0132, + "reward": 0.026579681783914566, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.07723978534340858, + "rewards/cosine_scaled_reward": -0.07851309701800346, + "rewards/format_reward": 0.31250000186264515, + "step": 469 + }, + { + "advantage_max": 0.17276223795488477, + "advantage_mean": 4.035731332452386e-09, + "advantage_min": -0.15901347948238254, + "advantage_std": 0.1274056350812316, + "completion_length": 2975.437545776367, + "epoch": 0.5371428571428571, + "grad_norm": 0.05028533563017845, + "kl": 0.07293701171875, + "learning_rate": 1.0983357966978745e-07, + "loss": 0.0147, + "reward": 0.06911044649314135, + "reward_advantage_correlation": 1.0, + "reward_std": 0.12740564392879605, + 
"rewards/cosine_scaled_reward": -0.037810999900102615, + "rewards/format_reward": 0.47916667349636555, + "step": 470 + }, + { + "advantage_max": 0.14420464355498552, + "advantage_mean": -5.3551047971001076e-09, + "advantage_min": -0.09883822966367006, + "advantage_std": 0.09858017042279243, + "completion_length": 2936.1458740234375, + "epoch": 0.5382857142857143, + "grad_norm": 0.07671011984348297, + "kl": 0.071929931640625, + "learning_rate": 1.0919113768029517e-07, + "loss": 0.0169, + "reward": 0.04580727685242891, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09858017321676016, + "rewards/cosine_scaled_reward": -0.03325042815413326, + "rewards/format_reward": 0.3333333358168602, + "step": 471 + }, + { + "advantage_max": 0.12537370715290308, + "advantage_mean": -1.9014503166436825e-09, + "advantage_min": -0.12702995259314775, + "advantage_std": 0.105596958193928, + "completion_length": 2686.125030517578, + "epoch": 0.5394285714285715, + "grad_norm": 0.052131302654743195, + "kl": 0.05517578125, + "learning_rate": 1.0857018009286381e-07, + "loss": 0.0072, + "reward": 0.03351810248568654, + "reward_advantage_correlation": 0.9999999999999997, + "reward_std": 0.10559695912525058, + "rewards/cosine_scaled_reward": -0.15133550064638257, + "rewards/format_reward": 0.5000000111758709, + "step": 472 + }, + { + "advantage_max": 0.16438235435634851, + "advantage_mean": 1.7462298690373856e-09, + "advantage_min": -0.1541678113862872, + "advantage_std": 0.12999227130785584, + "completion_length": 2772.229202270508, + "epoch": 0.5405714285714286, + "grad_norm": 0.20341692864894867, + "kl": 0.0763702392578125, + "learning_rate": 1.0797073717209013e-07, + "loss": 0.0181, + "reward": 0.011351976543664932, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.1299922768957913, + "rewards/cosine_scaled_reward": -0.15522437915205956, + "rewards/format_reward": 0.3750000111758709, + "step": 473 + }, + { + "advantage_max": 0.12965345289558172, + 
"advantage_mean": -1.0477379214224314e-08, + "advantage_min": -0.13344207033514977, + "advantage_std": 0.1010923438007012, + "completion_length": 2297.3333740234375, + "epoch": 0.5417142857142857, + "grad_norm": 0.03711497038602829, + "kl": 0.05191802978515625, + "learning_rate": 1.0739283813397639e-07, + "loss": 0.0078, + "reward": 0.1403076218557544, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10109234321862459, + "rewards/cosine_scaled_reward": 0.16363799665123224, + "rewards/format_reward": 0.5000000093132257, + "step": 474 + }, + { + "advantage_max": 0.24887295626103878, + "advantage_mean": -1.1641532182693481e-09, + "advantage_min": -0.16283766739070415, + "advantage_std": 0.16864926600828767, + "completion_length": 2069.479217529297, + "epoch": 0.5428571428571428, + "grad_norm": 0.20244264602661133, + "kl": 0.05932807922363281, + "learning_rate": 1.068365111445064e-07, + "loss": 0.02, + "reward": 0.10613728279713541, + "reward_advantage_correlation": 1.0, + "reward_std": 0.16864926647394896, + "rewards/cosine_scaled_reward": -0.0021053925156593323, + "rewards/format_reward": 0.6250000037252903, + "step": 475 + }, + { + "advantage_max": 0.2350059635937214, + "advantage_mean": -3.1044085357923024e-09, + "advantage_min": -0.2852534279227257, + "advantage_std": 0.22215755190700293, + "completion_length": 2249.7500762939453, + "epoch": 0.544, + "grad_norm": 0.27493569254875183, + "kl": 0.056476593017578125, + "learning_rate": 1.063017833182728e-07, + "loss": 0.0296, + "reward": 0.18973252084106207, + "reward_advantage_correlation": 1.0, + "reward_std": 0.22215756494551897, + "rewards/cosine_scaled_reward": 0.2016936163417995, + "rewards/format_reward": 0.7083333563059568, + "step": 476 + }, + { + "advantage_max": 0.20565590541809797, + "advantage_mean": 2.173086016687975e-09, + "advantage_min": -0.20990248955786228, + "advantage_std": 0.16222712211310863, + "completion_length": 1889.9792137145996, + "epoch": 0.5451428571428572, + "grad_norm": 
0.08550294488668442, + "kl": 0.049556732177734375, + "learning_rate": 1.0578868071715544e-07, + "loss": 0.007, + "reward": 0.1523238776717335, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1622271267697215, + "rewards/cosine_scaled_reward": 0.10249492339789867, + "rewards/format_reward": 0.6875000223517418, + "step": 477 + }, + { + "advantage_max": 0.19569236552342772, + "advantage_mean": -5.355105053839182e-09, + "advantage_min": -0.170789347961545, + "advantage_std": 0.14821833465248346, + "completion_length": 2707.7917251586914, + "epoch": 0.5462857142857143, + "grad_norm": 0.0707746297121048, + "kl": 0.04302215576171875, + "learning_rate": 1.0529722834905125e-07, + "loss": 0.0062, + "reward": 0.08682430069893599, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.14821833930909634, + "rewards/cosine_scaled_reward": 0.005293058231472969, + "rewards/format_reward": 0.5000000111758709, + "step": 478 + }, + { + "advantage_max": 0.20708859246224165, + "advantage_mean": 3.49245968256362e-09, + "advantage_min": -0.0934189772233367, + "advantage_std": 0.11944379657506943, + "completion_length": 2784.8750610351562, + "epoch": 0.5474285714285714, + "grad_norm": 0.06040840968489647, + "kl": 0.058135986328125, + "learning_rate": 1.0482745016665526e-07, + "loss": 0.0089, + "reward": 0.025700575672090054, + "reward_advantage_correlation": 1.0, + "reward_std": 0.11944379936903715, + "rewards/cosine_scaled_reward": -0.18563230335712433, + "rewards/format_reward": 0.5208333376795053, + "step": 479 + }, + { + "advantage_max": 0.09567822678945959, + "advantage_mean": -1.358178718796621e-09, + "advantage_min": -0.12376060243695974, + "advantage_std": 0.08115771319717169, + "completion_length": 2284.291732788086, + "epoch": 0.5485714285714286, + "grad_norm": 0.04082602262496948, + "kl": 0.0604248046875, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0068, + "reward": 0.0292730022338219, + "reward_advantage_correlation": 1.0, + "reward_std": 
0.08115771505981684, + "rewards/cosine_scaled_reward": -0.17489056783961132, + "rewards/format_reward": 0.5208333469927311, + "step": 480 + }, + { + "advantage_max": 0.15009752474725246, + "advantage_mean": -1.2417634628891783e-09, + "advantage_min": -0.10813257563859224, + "advantage_std": 0.0981076592579484, + "completion_length": 2793.0625762939453, + "epoch": 0.5497142857142857, + "grad_norm": 0.039628688246011734, + "kl": 0.0657196044921875, + "learning_rate": 1.0395300688680625e-07, + "loss": 0.012, + "reward": 0.006433199101593345, + "reward_advantage_correlation": 1.0, + "reward_std": 0.09810765460133553, + "rewards/cosine_scaled_reward": -0.24164481833577156, + "rewards/format_reward": 0.5208333469927311, + "step": 481 + }, + { + "advantage_max": 0.18115054722875357, + "advantage_mean": 3.3372393737352013e-09, + "advantage_min": -0.11325318366289139, + "advantage_std": 0.11549648176878691, + "completion_length": 2272.8958892822266, + "epoch": 0.5508571428571428, + "grad_norm": 0.05143104866147041, + "kl": 0.0427398681640625, + "learning_rate": 1.0354838440848501e-07, + "loss": 0.0101, + "reward": 0.08894058922305703, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1154964854940772, + "rewards/cosine_scaled_reward": -0.01966316059406381, + "rewards/format_reward": 0.5625000055879354, + "step": 482 + }, + { + "advantage_max": 0.15216759871691465, + "advantage_mean": -4.190951752303107e-09, + "advantage_min": -0.1571501288563013, + "advantage_std": 0.11147296661511064, + "completion_length": 2788.2708740234375, + "epoch": 0.552, + "grad_norm": 0.07072841376066208, + "kl": 0.05242156982421875, + "learning_rate": 1.0316552135205837e-07, + "loss": 0.0069, + "reward": 0.047933751717209816, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.11147296894341707, + "rewards/cosine_scaled_reward": -0.10883669415488839, + "rewards/format_reward": 0.5000000223517418, + "step": 483 + }, + { + "advantage_max": 0.17311536194756627, + 
"advantage_mean": -3.880510884846089e-09, + "advantage_min": -0.14954284392297268, + "advantage_std": 0.1357504017651081, + "completion_length": 2194.6458625793457, + "epoch": 0.5531428571428572, + "grad_norm": 0.06790361553430557, + "kl": 0.04769134521484375, + "learning_rate": 1.0280443637773163e-07, + "loss": 0.0047, + "reward": 0.08819579007104039, + "reward_advantage_correlation": 1.0, + "reward_std": 0.13575040455907583, + "rewards/cosine_scaled_reward": -0.04436913412064314, + "rewards/format_reward": 0.6041666679084301, + "step": 484 + }, + { + "advantage_max": 0.2007480701431632, + "advantage_mean": -1.7074247016246602e-09, + "advantage_min": -0.21596927661448717, + "advantage_std": 0.15590744372457266, + "completion_length": 2008.2708892822266, + "epoch": 0.5542857142857143, + "grad_norm": 0.1535339206457138, + "kl": 0.056400299072265625, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.0204, + "reward": 0.11056611873209476, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1559074493125081, + "rewards/cosine_scaled_reward": -0.017422407865524292, + "rewards/format_reward": 0.6875000204890966, + "step": 485 + }, + { + "advantage_max": 0.13929086178541183, + "advantage_mean": -5.665545983746245e-09, + "advantage_min": -0.08767893025651574, + "advantage_std": 0.08762249257415533, + "completion_length": 2356.1667098999023, + "epoch": 0.5554285714285714, + "grad_norm": 0.1545788198709488, + "kl": 0.08535957336425781, + "learning_rate": 1.0214767000817596e-07, + "loss": 0.0109, + "reward": 0.0351280951872468, + "reward_advantage_correlation": 1.0, + "reward_std": 0.08762249490246177, + "rewards/cosine_scaled_reward": -0.11772407731041312, + "rewards/format_reward": 0.43750000186264515, + "step": 486 + }, + { + "advantage_max": 0.1587705770507455, + "advantage_mean": -7.101335020021082e-09, + "advantage_min": -0.15303611755371094, + "advantage_std": 0.1276407791301608, + "completion_length": 1725.5208358764648, + "epoch": 
0.5565714285714286, + "grad_norm": 0.025498030707240105, + "kl": 0.0302734375, + "learning_rate": 1.0185202062281336e-07, + "loss": 0.0019, + "reward": 0.18238393031060696, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.12764078192412853, + "rewards/cosine_scaled_reward": 0.16194906132295728, + "rewards/format_reward": 0.7500000111758709, + "step": 487 + }, + { + "advantage_max": 0.09379342943429947, + "advantage_mean": -1.0865430222217753e-09, + "advantage_min": -0.14438163582235575, + "advantage_std": 0.08854867145419121, + "completion_length": 1983.2500381469727, + "epoch": 0.5577142857142857, + "grad_norm": 0.07698236405849457, + "kl": 0.0438690185546875, + "learning_rate": 1.0157821333772304e-07, + "loss": 0.0027, + "reward": 0.04897049597639125, + "reward_advantage_correlation": 1.0, + "reward_std": 0.08854867285117507, + "rewards/cosine_scaled_reward": -0.16868338361382484, + "rewards/format_reward": 0.6250000223517418, + "step": 488 + }, + { + "advantage_max": 0.15139921591617167, + "advantage_mean": 1.7074247571358114e-09, + "advantage_min": -0.1027588089928031, + "advantage_std": 0.10175243532285094, + "completion_length": 3048.5833892822266, + "epoch": 0.5588571428571428, + "grad_norm": 0.08280462771654129, + "kl": 0.09661865234375, + "learning_rate": 1.013262614978859e-07, + "loss": 0.0149, + "reward": -0.036369886714965105, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10175243951380253, + "rewards/cosine_scaled_reward": -0.23218814376741648, + "rewards/format_reward": 0.2500000074505806, + "step": 489 + }, + { + "advantage_max": 0.16806945484131575, + "advantage_mean": -5.587935794637566e-09, + "advantage_min": -0.10560749378055334, + "advantage_std": 0.10307761421427131, + "completion_length": 2269.4375610351562, + "epoch": 0.56, + "grad_norm": 0.04493989422917366, + "kl": 0.047054290771484375, + "learning_rate": 1.0109617738307911e-07, + "loss": 0.0087, + "reward": 0.0595971189904958, + 
"reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.1030776179395616, + "rewards/cosine_scaled_reward": -0.1576940380036831, + "rewards/format_reward": 0.6666666772216558, + "step": 490 + }, + { + "advantage_max": 0.16114369360730052, + "advantage_mean": -2.910383146287332e-09, + "advantage_min": -0.17943292623385787, + "advantage_std": 0.13655775994993746, + "completion_length": 2471.375068664551, + "epoch": 0.5611428571428572, + "grad_norm": 0.07799820601940155, + "kl": 0.08380126953125, + "learning_rate": 1.0088797220727779e-07, + "loss": 0.0112, + "reward": 0.15404712711460888, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13655776320956647, + "rewards/cosine_scaled_reward": 0.16089667566120625, + "rewards/format_reward": 0.5833333376795053, + "step": 491 + }, + { + "advantage_max": 0.13228206429630518, + "advantage_mean": -1.785034897672233e-09, + "advantage_min": -0.1156699366401881, + "advantage_std": 0.10165782272815704, + "completion_length": 2402.5833892822266, + "epoch": 0.5622857142857143, + "grad_norm": 0.11941049993038177, + "kl": 0.1121826171875, + "learning_rate": 1.0070165611810855e-07, + "loss": 0.0104, + "reward": 0.04288489208556712, + "reward_advantage_correlation": 1.0, + "reward_std": 0.10165782598778605, + "rewards/cosine_scaled_reward": -0.02407931163907051, + "rewards/format_reward": 0.2916666716337204, + "step": 492 + }, + { + "advantage_max": 0.2312077321112156, + "advantage_mean": -5.5103253071564495e-09, + "advantage_min": -0.20651198737323284, + "advantage_std": 0.17073472030460835, + "completion_length": 2009.2500534057617, + "epoch": 0.5634285714285714, + "grad_norm": 0.08874164521694183, + "kl": 0.06610870361328125, + "learning_rate": 1.005372381963547e-07, + "loss": 0.015, + "reward": 0.12461904282099567, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.17073472402989864, + "rewards/cosine_scaled_reward": 0.02050326857715845, + "rewards/format_reward": 
0.6875000074505806, + "step": 493 + }, + { + "advantage_max": 0.16276492271572351, + "advantage_mean": 6.984919101449272e-10, + "advantage_min": -0.107215684838593, + "advantage_std": 0.10512557066977024, + "completion_length": 2005.8333587646484, + "epoch": 0.5645714285714286, + "grad_norm": 0.026444094255566597, + "kl": 0.0601348876953125, + "learning_rate": 1.0039472645551372e-07, + "loss": 0.009, + "reward": 0.1630040816962719, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.10512557113543153, + "rewards/cosine_scaled_reward": 0.03179990313947201, + "rewards/format_reward": 0.8958333395421505, + "step": 494 + }, + { + "advantage_max": 0.15834691934287548, + "advantage_mean": -3.9581211780381764e-09, + "advantage_min": -0.12651699222624302, + "advantage_std": 0.12769749155268073, + "completion_length": 2761.166679382324, + "epoch": 0.5657142857142857, + "grad_norm": 0.06954298168420792, + "kl": 0.0700836181640625, + "learning_rate": 1.002741278414069e-07, + "loss": 0.0113, + "reward": 0.020432849414646626, + "reward_advantage_correlation": 1.0, + "reward_std": 0.1276974929496646, + "rewards/cosine_scaled_reward": -0.11860458739101887, + "rewards/format_reward": 0.3541666753590107, + "step": 495 + }, + { + "advantage_max": 0.1300267931073904, + "advantage_mean": -7.761022102759085e-10, + "advantage_min": -0.10878966562449932, + "advantage_std": 0.08884454821236432, + "completion_length": 1965.7708740234375, + "epoch": 0.5668571428571428, + "grad_norm": 0.03823034092783928, + "kl": 0.043365478515625, + "learning_rate": 1.0017544823184055e-07, + "loss": 0.0057, + "reward": 0.13961502793245018, + "reward_advantage_correlation": 1.0, + "reward_std": 0.08884455054067075, + "rewards/cosine_scaled_reward": 0.09778555016964674, + "rewards/format_reward": 0.6250000055879354, + "step": 496 + }, + { + "advantage_max": 0.16680945828557014, + "advantage_mean": -1.552204281773939e-09, + "advantage_min": -0.16890859883278608, + "advantage_std": 
0.13287213910371065, + "completion_length": 2398.020866394043, + "epoch": 0.568, + "grad_norm": 0.05378331243991852, + "kl": 0.062191009521484375, + "learning_rate": 1.0009869243631952e-07, + "loss": 0.0095, + "reward": 0.1791801903546002, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13287214329466224, + "rewards/cosine_scaled_reward": 0.2254606424830854, + "rewards/format_reward": 0.6041666734963655, + "step": 497 + }, + { + "advantage_max": 0.24626314919441938, + "advantage_mean": -7.761021270091817e-10, + "advantage_min": -0.15786647517234087, + "advantage_std": 0.1566294403746724, + "completion_length": 2678.2292404174805, + "epoch": 0.5691428571428572, + "grad_norm": 0.10654427111148834, + "kl": 0.0792694091796875, + "learning_rate": 1.000438641958131e-07, + "loss": 0.0229, + "reward": 0.043071957159554586, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.15662944316864014, + "rewards/cosine_scaled_reward": -0.10418755980208516, + "rewards/format_reward": 0.4583333432674408, + "step": 498 + }, + { + "advantage_max": 0.1925945421680808, + "advantage_mean": -3.6476802134366437e-09, + "advantage_min": -0.2209562873467803, + "advantage_std": 0.1784151755273342, + "completion_length": 2237.104217529297, + "epoch": 0.5702857142857143, + "grad_norm": 0.06882494688034058, + "kl": 0.047565460205078125, + "learning_rate": 1.0001096618257236e-07, + "loss": 0.0128, + "reward": 0.15613868786022067, + "reward_advantage_correlation": 0.9999999999999998, + "reward_std": 0.1784151801839471, + "rewards/cosine_scaled_reward": 0.10448653064668179, + "rewards/format_reward": 0.7083333432674408, + "step": 499 + }, + { + "advantage_max": 0.1740710544399917, + "advantage_mean": -2.793967751602011e-09, + "advantage_min": -0.15050521213561296, + "advantage_std": 0.135882212780416, + "completion_length": 2955.3959045410156, + "epoch": 0.5714285714285714, + "grad_norm": 0.10638931393623352, + "kl": 0.091705322265625, + "learning_rate": 
1e-07, + "loss": 0.016, + "reward": 0.022939922448131256, + "reward_advantage_correlation": 0.9999999999999999, + "reward_std": 0.13588221883401275, + "rewards/cosine_scaled_reward": -0.14141679741442204, + "rewards/format_reward": 0.4166666753590107, + "step": 500 + }, + { + "epoch": 0.5714285714285714, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.007517670260873274, + "train_runtime": 53153.3665, + "train_samples_per_second": 0.452, + "train_steps_per_second": 0.009 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 25, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..8090319 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:541ad8a10b1e5b2e478a4179066a576ea08c7f01ec5162c968fdd06d4d374cb7 +size 8440