commit c97062218351e4cb5dddbf7a1af3d6284f323656 Author: ModelHub XC Date: Tue Jun 16 12:39:17 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: mimoidochi/OpenRS-GRPO Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 
0000000..4df3363 --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +datasets: knoveleng/open-rs +library_name: transformers +model_name: OpenRS-GRPO +tags: +- generated_from_trainer +- open-r1 +- trl +- grpo +licence: license +--- + +# Model Card for OpenRS-GRPO + +This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on the [knoveleng/open-rs](https://huggingface.co/datasets/knoveleng/open-rs) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="mimoidochi/OpenRS-GRPO", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/vrshy-stanford/huggingface/runs/b44fvgqi) + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + +### Framework versions + +- TRL: 0.14.0 +- Transformers: 4.49.0 +- Pytorch: 2.5.1 +- Datasets: 4.5.0 +- Tokenizers: 0.21.4 + +## Citations + +Cite GRPO as: + +```bibtex +@article{zhihong2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. 
Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} + +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..c51d466 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 0.0008326698157195504, + "train_runtime": 30165.0377, + "train_samples": 7000, + "train_samples_per_second": 0.239, + "train_steps_per_second": 0.01 +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..7052064 --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": 4096, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.49.0", + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..01dfe4b --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + 
"bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.49.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..198989b --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fa1ab875139569ba05600c726346b9d9097b97e305e3add4d3e3ec89c8bc4fe +size 3554214752 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..e7cd2c1 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a +size 11422959 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ef6e98c --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,195 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + 
"content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + 
'```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..c51d466 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 0.0008326698157195504, + "train_runtime": 30165.0377, + "train_samples": 7000, + "train_samples_per_second": 0.239, + "train_steps_per_second": 0.01 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..a03554d --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,3942 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0273972602739727, + 
"eval_steps": 500, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 2687.75, + "epoch": 0.003424657534246575, + "grad_norm": 0.18900136649608612, + "kl": 0.0, + "learning_rate": 3.3333333333333334e-08, + "loss": 0.0, + "reward": 0.3042634315788746, + "reward_std": 0.4194334000349045, + "rewards/cosine_scaled_reward": -0.14712543413043022, + "rewards/format_reward": 0.4513889104127884, + "step": 1 + }, + { + "completion_length": 2708.71533203125, + "epoch": 0.00684931506849315, + "grad_norm": 0.18016651272773743, + "kl": 0.0, + "learning_rate": 6.666666666666667e-08, + "loss": 0.0, + "reward": 0.3438135087490082, + "reward_std": 0.4610961228609085, + "rewards/cosine_scaled_reward": -0.1353531926870346, + "rewards/format_reward": 0.4791666716337204, + "step": 2 + }, + { + "completion_length": 2606.2569580078125, + "epoch": 0.010273972602739725, + "grad_norm": 0.20038573443889618, + "kl": 9.512901306152344e-05, + "learning_rate": 1e-07, + "loss": 0.0, + "reward": 0.4459744915366173, + "reward_std": 0.4223913550376892, + "rewards/cosine_scaled_reward": -0.033192168921232224, + "rewards/format_reward": 0.4791666567325592, + "step": 3 + }, + { + "completion_length": 2475.6805419921875, + "epoch": 0.0136986301369863, + "grad_norm": 0.288876473903656, + "kl": 0.00010967254638671875, + "learning_rate": 1.3333333333333334e-07, + "loss": 0.0, + "reward": 0.5504279434680939, + "reward_std": 0.4054145812988281, + "rewards/cosine_scaled_reward": -0.07457206398248672, + "rewards/format_reward": 0.625, + "step": 4 + }, + { + "completion_length": 2784.541748046875, + "epoch": 0.017123287671232876, + "grad_norm": 0.14747899770736694, + "kl": 0.00010347366333007812, + "learning_rate": 1.6666666666666665e-07, + "loss": 0.0, + "reward": 0.3733392059803009, + "reward_std": 0.4893380403518677, + "rewards/cosine_scaled_reward": -0.057216365821659565, + 
"rewards/format_reward": 0.4305555522441864, + "step": 5 + }, + { + "completion_length": 3103.0902099609375, + "epoch": 0.02054794520547945, + "grad_norm": 0.1839321404695511, + "kl": 0.00011086463928222656, + "learning_rate": 2e-07, + "loss": 0.0, + "reward": 0.02957332320511341, + "reward_std": 0.3297805115580559, + "rewards/cosine_scaled_reward": -0.22042667865753174, + "rewards/format_reward": 0.2499999925494194, + "step": 6 + }, + { + "completion_length": 2373.9375, + "epoch": 0.023972602739726026, + "grad_norm": 0.22951197624206543, + "kl": 0.00012159347534179688, + "learning_rate": 2.3333333333333333e-07, + "loss": 0.0, + "reward": 0.4939586818218231, + "reward_std": 0.3855544626712799, + "rewards/cosine_scaled_reward": -0.04770803824067116, + "rewards/format_reward": 0.5416666716337204, + "step": 7 + }, + { + "completion_length": 2863.65283203125, + "epoch": 0.0273972602739726, + "grad_norm": 0.1952303647994995, + "kl": 0.00010585784912109375, + "learning_rate": 2.6666666666666667e-07, + "loss": 0.0, + "reward": 0.17456580698490143, + "reward_std": 0.4745761901140213, + "rewards/cosine_scaled_reward": -0.15182308107614517, + "rewards/format_reward": 0.3263888880610466, + "step": 8 + }, + { + "completion_length": 3127.5833740234375, + "epoch": 0.030821917808219176, + "grad_norm": 0.12419066578149796, + "kl": 0.00010228157043457031, + "learning_rate": 3e-07, + "loss": 0.0, + "reward": 0.10362935438752174, + "reward_std": 0.3372735381126404, + "rewards/cosine_scaled_reward": -0.15331509709358215, + "rewards/format_reward": 0.2569444552063942, + "step": 9 + }, + { + "completion_length": 3021.9722900390625, + "epoch": 0.03424657534246575, + "grad_norm": 0.1988951563835144, + "kl": 0.00011277198791503906, + "learning_rate": 3.333333333333333e-07, + "loss": 0.0, + "reward": 0.13170428201556206, + "reward_std": 0.41356146335601807, + "rewards/cosine_scaled_reward": -0.16690683364868164, + "rewards/format_reward": 0.298611119389534, + "step": 10 + }, + { + 
"completion_length": 2931.1319580078125, + "epoch": 0.03767123287671233, + "grad_norm": 0.18511006236076355, + "kl": 0.0001304149627685547, + "learning_rate": 3.666666666666666e-07, + "loss": 0.0, + "reward": 0.1742808436974883, + "reward_std": 0.32126323878765106, + "rewards/cosine_scaled_reward": -0.17294137924909592, + "rewards/format_reward": 0.3472222238779068, + "step": 11 + }, + { + "completion_length": 2932.5555419921875, + "epoch": 0.0410958904109589, + "grad_norm": 0.16578702628612518, + "kl": 0.00010228157043457031, + "learning_rate": 4e-07, + "loss": 0.0, + "reward": 0.19121046364307404, + "reward_std": 0.3839987516403198, + "rewards/cosine_scaled_reward": -0.16295619308948517, + "rewards/format_reward": 0.3541666567325592, + "step": 12 + }, + { + "completion_length": 2798.6180419921875, + "epoch": 0.04452054794520548, + "grad_norm": 0.15695109963417053, + "kl": 9.846687316894531e-05, + "learning_rate": 4.3333333333333335e-07, + "loss": 0.0, + "reward": 0.40019115805625916, + "reward_std": 0.49739648401737213, + "rewards/cosine_scaled_reward": -0.05814217543229461, + "rewards/format_reward": 0.4583333432674408, + "step": 13 + }, + { + "completion_length": 2636.548583984375, + "epoch": 0.04794520547945205, + "grad_norm": 0.19918033480644226, + "kl": 0.00012230873107910156, + "learning_rate": 4.6666666666666666e-07, + "loss": 0.0, + "reward": 0.5720034390687943, + "reward_std": 0.46172526478767395, + "rewards/cosine_scaled_reward": 0.009503423236310482, + "rewards/format_reward": 0.5625, + "step": 14 + }, + { + "completion_length": 2823.763916015625, + "epoch": 0.05136986301369863, + "grad_norm": 0.13800376653671265, + "kl": 0.00010466575622558594, + "learning_rate": 5e-07, + "loss": 0.0, + "reward": 0.46923990547657013, + "reward_std": 0.5656772404909134, + "rewards/cosine_scaled_reward": -0.037704543210566044, + "rewards/format_reward": 0.5069444477558136, + "step": 15 + }, + { + "completion_length": 2824.2777099609375, + "epoch": 0.0547945205479452, + 
"grad_norm": 0.19206716120243073, + "kl": 0.00011682510375976562, + "learning_rate": 5.333333333333333e-07, + "loss": 0.0, + "reward": 0.23321796208620071, + "reward_std": 0.41186313331127167, + "rewards/cosine_scaled_reward": -0.10011539235711098, + "rewards/format_reward": 0.3333333432674408, + "step": 16 + }, + { + "completion_length": 2954.1112060546875, + "epoch": 0.05821917808219178, + "grad_norm": 0.19295988976955414, + "kl": 0.00010442733764648438, + "learning_rate": 5.666666666666666e-07, + "loss": 0.0, + "reward": 0.28434962406754494, + "reward_std": 0.5011897683143616, + "rewards/cosine_scaled_reward": -0.07676149532198906, + "rewards/format_reward": 0.361111119389534, + "step": 17 + }, + { + "completion_length": 2896.78466796875, + "epoch": 0.06164383561643835, + "grad_norm": 0.16591955721378326, + "kl": 0.00012493133544921875, + "learning_rate": 6e-07, + "loss": 0.0, + "reward": 0.1477920152246952, + "reward_std": 0.3473651483654976, + "rewards/cosine_scaled_reward": -0.1855413243174553, + "rewards/format_reward": 0.3333333432674408, + "step": 18 + }, + { + "completion_length": 2883.125, + "epoch": 0.06506849315068493, + "grad_norm": 0.24647116661071777, + "kl": 0.0001239776611328125, + "learning_rate": 6.333333333333332e-07, + "loss": 0.0, + "reward": 0.23157373815774918, + "reward_std": 0.3507264107465744, + "rewards/cosine_scaled_reward": -0.10870405659079552, + "rewards/format_reward": 0.3402777761220932, + "step": 19 + }, + { + "completion_length": 2961.7362060546875, + "epoch": 0.0684931506849315, + "grad_norm": 0.18728116154670715, + "kl": 0.000133514404296875, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0, + "reward": 0.17506830394268036, + "reward_std": 0.30466167628765106, + "rewards/cosine_scaled_reward": -0.13743170350790024, + "rewards/format_reward": 0.3125, + "step": 20 + }, + { + "completion_length": 2186.2501220703125, + "epoch": 0.07191780821917808, + "grad_norm": 0.21891450881958008, + "kl": 0.00015282630920410156, + 
"learning_rate": 7e-07, + "loss": 0.0, + "reward": 0.8545757234096527, + "reward_std": 0.4611281454563141, + "rewards/cosine_scaled_reward": 0.1392979435622692, + "rewards/format_reward": 0.715277761220932, + "step": 21 + }, + { + "completion_length": 2804.2291259765625, + "epoch": 0.07534246575342465, + "grad_norm": 0.17504653334617615, + "kl": 0.000141143798828125, + "learning_rate": 7.333333333333332e-07, + "loss": 0.0, + "reward": 0.26354077458381653, + "reward_std": 0.30501972138881683, + "rewards/cosine_scaled_reward": -0.16007035970687866, + "rewards/format_reward": 0.423611119389534, + "step": 22 + }, + { + "completion_length": 2849.8958740234375, + "epoch": 0.07876712328767123, + "grad_norm": 0.16187402606010437, + "kl": 0.00014853477478027344, + "learning_rate": 7.666666666666667e-07, + "loss": 0.0, + "reward": 0.2565463110804558, + "reward_std": 0.4471036493778229, + "rewards/cosine_scaled_reward": -0.11150925606489182, + "rewards/format_reward": 0.3680555522441864, + "step": 23 + }, + { + "completion_length": 2847.166748046875, + "epoch": 0.0821917808219178, + "grad_norm": 0.1545354276895523, + "kl": 0.0001323223114013672, + "learning_rate": 8e-07, + "loss": 0.0, + "reward": 0.4566986709833145, + "reward_std": 0.5350392460823059, + "rewards/cosine_scaled_reward": -0.015523582696914673, + "rewards/format_reward": 0.4722222238779068, + "step": 24 + }, + { + "completion_length": 2807.2708740234375, + "epoch": 0.08561643835616438, + "grad_norm": 0.18177877366542816, + "kl": 0.00020647048950195312, + "learning_rate": 8.333333333333333e-07, + "loss": 0.0, + "reward": 0.21164313331246376, + "reward_std": 0.3955174386501312, + "rewards/cosine_scaled_reward": -0.15641241893172264, + "rewards/format_reward": 0.3680555522441864, + "step": 25 + }, + { + "completion_length": 2927.194580078125, + "epoch": 0.08904109589041095, + "grad_norm": 0.17393368482589722, + "kl": 0.00020933151245117188, + "learning_rate": 8.666666666666667e-07, + "loss": 0.0, + "reward": 
0.370952308177948, + "reward_std": 0.4646989554166794, + "rewards/cosine_scaled_reward": -0.017936568707227707, + "rewards/format_reward": 0.388888880610466, + "step": 26 + }, + { + "completion_length": 2989.7291259765625, + "epoch": 0.09246575342465753, + "grad_norm": 0.19562838971614838, + "kl": 0.00015878677368164062, + "learning_rate": 9e-07, + "loss": 0.0, + "reward": 0.10655032098293304, + "reward_std": 0.4004325717687607, + "rewards/cosine_scaled_reward": -0.20594968646764755, + "rewards/format_reward": 0.3125, + "step": 27 + }, + { + "completion_length": 3012.5555419921875, + "epoch": 0.0958904109589041, + "grad_norm": 0.15751628577709198, + "kl": 0.00021696090698242188, + "learning_rate": 9.333333333333333e-07, + "loss": 0.0, + "reward": 0.3497147411108017, + "reward_std": 0.6247645616531372, + "rewards/cosine_scaled_reward": -0.0738963820040226, + "rewards/format_reward": 0.423611119389534, + "step": 28 + }, + { + "completion_length": 3110.6181640625, + "epoch": 0.09931506849315068, + "grad_norm": 0.13668763637542725, + "kl": 0.00023889541625976562, + "learning_rate": 9.666666666666666e-07, + "loss": 0.0, + "reward": 0.10539204813539982, + "reward_std": 0.30441631376743317, + "rewards/cosine_scaled_reward": -0.17933017387986183, + "rewards/format_reward": 0.2847222238779068, + "step": 29 + }, + { + "completion_length": 2887.2083740234375, + "epoch": 0.10273972602739725, + "grad_norm": 0.17280973494052887, + "kl": 0.00022649765014648438, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 0.4436980187892914, + "reward_std": 0.5404154658317566, + "rewards/cosine_scaled_reward": 0.006197985261678696, + "rewards/format_reward": 0.4375, + "step": 30 + }, + { + "completion_length": 2932.757080078125, + "epoch": 0.10616438356164383, + "grad_norm": 0.15570040047168732, + "kl": 0.00044155120849609375, + "learning_rate": 9.99969538601693e-07, + "loss": 0.0, + "reward": 0.16736368834972382, + "reward_std": 0.4045102745294571, + "rewards/cosine_scaled_reward": 
-0.15208075568079948, + "rewards/format_reward": 0.3194444477558136, + "step": 31 + }, + { + "completion_length": 3094.4444580078125, + "epoch": 0.1095890410958904, + "grad_norm": 0.14696453511714935, + "kl": 0.0003809928894042969, + "learning_rate": 9.998781585307575e-07, + "loss": 0.0, + "reward": 0.1443577939644456, + "reward_std": 0.5129190236330032, + "rewards/cosine_scaled_reward": -0.16119776666164398, + "rewards/format_reward": 0.3055555522441864, + "step": 32 + }, + { + "completion_length": 2546.0556640625, + "epoch": 0.11301369863013698, + "grad_norm": 0.1807711124420166, + "kl": 0.00067138671875, + "learning_rate": 9.997258721585931e-07, + "loss": 0.0, + "reward": 0.5040995180606842, + "reward_std": 0.43710532784461975, + "rewards/cosine_scaled_reward": 0.011043965816497803, + "rewards/format_reward": 0.4930555671453476, + "step": 33 + }, + { + "completion_length": 2832.5902099609375, + "epoch": 0.11643835616438356, + "grad_norm": 0.16727592051029205, + "kl": 0.000640869140625, + "learning_rate": 9.99512700102336e-07, + "loss": 0.0, + "reward": 0.24528168514370918, + "reward_std": 0.4029017984867096, + "rewards/cosine_scaled_reward": -0.15749610401690006, + "rewards/format_reward": 0.4027777910232544, + "step": 34 + }, + { + "completion_length": 2865.923583984375, + "epoch": 0.11986301369863013, + "grad_norm": 0.1456187218427658, + "kl": 0.0006847381591796875, + "learning_rate": 9.992386712220707e-07, + "loss": 0.0, + "reward": 0.27862629294395447, + "reward_std": 0.37985116243362427, + "rewards/cosine_scaled_reward": -0.11026258394122124, + "rewards/format_reward": 0.3888888955116272, + "step": 35 + }, + { + "completion_length": 2459.1527709960938, + "epoch": 0.1232876712328767, + "grad_norm": 0.16684532165527344, + "kl": 0.0010623931884765625, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0, + "reward": 0.5571352988481522, + "reward_std": 0.4538237154483795, + "rewards/cosine_scaled_reward": 0.015468628145754337, + "rewards/format_reward": 
0.5416666865348816, + "step": 36 + }, + { + "completion_length": 2674.763916015625, + "epoch": 0.1267123287671233, + "grad_norm": 0.15782181918621063, + "kl": 0.001285552978515625, + "learning_rate": 9.985081996200277e-07, + "loss": 0.0001, + "reward": 0.3798440098762512, + "reward_std": 0.48301415145397186, + "rewards/cosine_scaled_reward": -0.08543377462774515, + "rewards/format_reward": 0.4652777910232544, + "step": 37 + }, + { + "completion_length": 2692.986083984375, + "epoch": 0.13013698630136986, + "grad_norm": 0.16332699358463287, + "kl": 0.0014743804931640625, + "learning_rate": 9.98051855792412e-07, + "loss": 0.0001, + "reward": 0.38910074532032013, + "reward_std": 0.3475951850414276, + "rewards/cosine_scaled_reward": -0.03451040526852012, + "rewards/format_reward": 0.423611119389534, + "step": 38 + }, + { + "completion_length": 3102.4930419921875, + "epoch": 0.13356164383561644, + "grad_norm": 0.1224745586514473, + "kl": 0.00080108642578125, + "learning_rate": 9.975348529157229e-07, + "loss": 0.0, + "reward": 0.22906357236206532, + "reward_std": 0.5051266252994537, + "rewards/cosine_scaled_reward": -0.10426976904273033, + "rewards/format_reward": 0.3333333358168602, + "step": 39 + }, + { + "completion_length": 2613.0347900390625, + "epoch": 0.136986301369863, + "grad_norm": 0.2610801160335541, + "kl": 0.00220489501953125, + "learning_rate": 9.969572609838744e-07, + "loss": 0.0001, + "reward": 0.39406873285770416, + "reward_std": 0.5201945602893829, + "rewards/cosine_scaled_reward": -0.13370903208851814, + "rewards/format_reward": 0.5277777910232544, + "step": 40 + }, + { + "completion_length": 2810.84033203125, + "epoch": 0.1404109589041096, + "grad_norm": 0.1520536243915558, + "kl": 0.00159454345703125, + "learning_rate": 9.963191581935677e-07, + "loss": 0.0001, + "reward": 0.26414267159998417, + "reward_std": 0.39119474589824677, + "rewards/cosine_scaled_reward": -0.12474621459841728, + "rewards/format_reward": 0.3888888955116272, + "step": 41 + }, + { 
+ "completion_length": 2832.3958740234375, + "epoch": 0.14383561643835616, + "grad_norm": 0.16354550421237946, + "kl": 0.001583099365234375, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0001, + "reward": 0.1603957675397396, + "reward_std": 0.2545732408761978, + "rewards/cosine_scaled_reward": -0.19377091526985168, + "rewards/format_reward": 0.3541666716337204, + "step": 42 + }, + { + "completion_length": 2594.4166259765625, + "epoch": 0.14726027397260275, + "grad_norm": 0.1575089693069458, + "kl": 0.002162933349609375, + "learning_rate": 9.948617737737001e-07, + "loss": 0.0001, + "reward": 0.4924927353858948, + "reward_std": 0.4764625281095505, + "rewards/cosine_scaled_reward": -0.05611838772892952, + "rewards/format_reward": 0.5486111044883728, + "step": 43 + }, + { + "completion_length": 3142.638916015625, + "epoch": 0.1506849315068493, + "grad_norm": 0.13056397438049316, + "kl": 0.0015392303466796875, + "learning_rate": 9.940426894506606e-07, + "loss": 0.0001, + "reward": 0.18407823704183102, + "reward_std": 0.40585757791996, + "rewards/cosine_scaled_reward": -0.0867550881812349, + "rewards/format_reward": 0.2708333432674408, + "step": 44 + }, + { + "completion_length": 2841.7501220703125, + "epoch": 0.1541095890410959, + "grad_norm": 0.1490578055381775, + "kl": 0.0020904541015625, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0001, + "reward": 0.24261729046702385, + "reward_std": 0.4036310315132141, + "rewards/cosine_scaled_reward": -0.11849382892251015, + "rewards/format_reward": 0.3611111119389534, + "step": 45 + }, + { + "completion_length": 2715.6944580078125, + "epoch": 0.15753424657534246, + "grad_norm": 0.18108582496643066, + "kl": 0.00254058837890625, + "learning_rate": 9.922242910178859e-07, + "loss": 0.0001, + "reward": 0.40730662643909454, + "reward_std": 0.5260264724493027, + "rewards/cosine_scaled_reward": -0.05797116830945015, + "rewards/format_reward": 0.4652777761220932, + "step": 46 + }, + { + "completion_length": 
2995.9862060546875, + "epoch": 0.16095890410958905, + "grad_norm": 0.23772801458835602, + "kl": 0.0029754638671875, + "learning_rate": 9.912252230901906e-07, + "loss": 0.0001, + "reward": 0.28154853731393814, + "reward_std": 0.44985754787921906, + "rewards/cosine_scaled_reward": -0.0726181073114276, + "rewards/format_reward": 0.3541666641831398, + "step": 47 + }, + { + "completion_length": 2810.7222900390625, + "epoch": 0.1643835616438356, + "grad_norm": 0.1889384686946869, + "kl": 0.00370025634765625, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0001, + "reward": 0.3217255771160126, + "reward_std": 0.5001529008150101, + "rewards/cosine_scaled_reward": -0.08105220319703221, + "rewards/format_reward": 0.4027777761220932, + "step": 48 + }, + { + "completion_length": 2650.09033203125, + "epoch": 0.1678082191780822, + "grad_norm": 0.13828006386756897, + "kl": 0.00315093994140625, + "learning_rate": 9.890480260828965e-07, + "loss": 0.0001, + "reward": 0.37053926289081573, + "reward_std": 0.40743280947208405, + "rewards/cosine_scaled_reward": -0.12251630332320929, + "rewards/format_reward": 0.4930555522441864, + "step": 49 + }, + { + "completion_length": 2785.84716796875, + "epoch": 0.17123287671232876, + "grad_norm": 0.16287721693515778, + "kl": 0.00333404541015625, + "learning_rate": 9.878701917609207e-07, + "loss": 0.0001, + "reward": 0.3723617196083069, + "reward_std": 0.4880683571100235, + "rewards/cosine_scaled_reward": -0.09291603974997997, + "rewards/format_reward": 0.4652777910232544, + "step": 50 + }, + { + "completion_length": 2878.1181640625, + "epoch": 0.17465753424657535, + "grad_norm": 0.13556380569934845, + "kl": 0.0032501220703125, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0001, + "reward": 0.2918383926153183, + "reward_std": 0.3286859840154648, + "rewards/cosine_scaled_reward": -0.1109393835067749, + "rewards/format_reward": 0.4027777910232544, + "step": 51 + }, + { + "completion_length": 2817.638916015625, + "epoch": 
0.1780821917808219, + "grad_norm": 0.16504673659801483, + "kl": 0.0037384033203125, + "learning_rate": 9.853368487582886e-07, + "loss": 0.0001, + "reward": 0.23075967282056808, + "reward_std": 0.3352076858282089, + "rewards/cosine_scaled_reward": -0.08868478238582611, + "rewards/format_reward": 0.3194444477558136, + "step": 52 + }, + { + "completion_length": 2916.1112060546875, + "epoch": 0.1815068493150685, + "grad_norm": 0.13884864747524261, + "kl": 0.00319671630859375, + "learning_rate": 9.839816830517225e-07, + "loss": 0.0001, + "reward": 0.28766703605651855, + "reward_std": 0.39385148882865906, + "rewards/cosine_scaled_reward": -0.09427741169929504, + "rewards/format_reward": 0.3819444477558136, + "step": 53 + }, + { + "completion_length": 2991.6041259765625, + "epoch": 0.18493150684931506, + "grad_norm": 0.1436394602060318, + "kl": 0.0031585693359375, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0001, + "reward": 0.35537297278642654, + "reward_std": 0.39448249340057373, + "rewards/cosine_scaled_reward": -0.04740479774773121, + "rewards/format_reward": 0.4027777910232544, + "step": 54 + }, + { + "completion_length": 2785.173583984375, + "epoch": 0.18835616438356165, + "grad_norm": 0.18955738842487335, + "kl": 0.0048980712890625, + "learning_rate": 9.8109528054197e-07, + "loss": 0.0002, + "reward": 0.2277716025710106, + "reward_std": 0.4378567188978195, + "rewards/cosine_scaled_reward": -0.1263950616121292, + "rewards/format_reward": 0.3541666716337204, + "step": 55 + }, + { + "completion_length": 2579.388916015625, + "epoch": 0.1917808219178082, + "grad_norm": 0.1459513008594513, + "kl": 0.003570556640625, + "learning_rate": 9.795644345114794e-07, + "loss": 0.0001, + "reward": 0.35765571892261505, + "reward_std": 0.39826689660549164, + "rewards/cosine_scaled_reward": -0.11456651613116264, + "rewards/format_reward": 0.4722222238779068, + "step": 56 + }, + { + "completion_length": 2586.9305419921875, + "epoch": 0.1952054794520548, + "grad_norm": 
0.20002588629722595, + "kl": 0.00665283203125, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0003, + "reward": 0.3011641800403595, + "reward_std": 0.397590771317482, + "rewards/cosine_scaled_reward": -0.1641135960817337, + "rewards/format_reward": 0.4652777761220932, + "step": 57 + }, + { + "completion_length": 2786.3333740234375, + "epoch": 0.19863013698630136, + "grad_norm": 0.15498584508895874, + "kl": 0.00461578369140625, + "learning_rate": 9.76328489131448e-07, + "loss": 0.0002, + "reward": 0.3455113209784031, + "reward_std": 0.38071802258491516, + "rewards/cosine_scaled_reward": -0.043377578258514404, + "rewards/format_reward": 0.3888889029622078, + "step": 58 + }, + { + "completion_length": 2466.763916015625, + "epoch": 0.20205479452054795, + "grad_norm": 0.14243252575397491, + "kl": 0.0069580078125, + "learning_rate": 9.746238278771125e-07, + "loss": 0.0003, + "reward": 0.5345294326543808, + "reward_std": 0.5026020854711533, + "rewards/cosine_scaled_reward": -0.021026152186095715, + "rewards/format_reward": 0.5555555522441864, + "step": 59 + }, + { + "completion_length": 2819.2083740234375, + "epoch": 0.2054794520547945, + "grad_norm": 0.13467197120189667, + "kl": 0.005340576171875, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0002, + "reward": 0.32512621581554413, + "reward_std": 0.3411554992198944, + "rewards/cosine_scaled_reward": -0.07070711255073547, + "rewards/format_reward": 0.3958333432674408, + "step": 60 + }, + { + "completion_length": 2741.6944580078125, + "epoch": 0.2089041095890411, + "grad_norm": 0.1468200534582138, + "kl": 0.0046539306640625, + "learning_rate": 9.71042282127789e-07, + "loss": 0.0002, + "reward": 0.4192769527435303, + "reward_std": 0.4629937559366226, + "rewards/cosine_scaled_reward": -0.10850081220269203, + "rewards/format_reward": 0.5277777910232544, + "step": 61 + }, + { + "completion_length": 2945.27783203125, + "epoch": 0.21232876712328766, + "grad_norm": 0.14704585075378418, + "kl": 0.00518798828125, + 
"learning_rate": 9.69165882516764e-07, + "loss": 0.0002, + "reward": 0.35519421100616455, + "reward_std": 0.49601832032203674, + "rewards/cosine_scaled_reward": -0.019805820658802986, + "rewards/format_reward": 0.375, + "step": 62 + }, + { + "completion_length": 2946.166748046875, + "epoch": 0.21575342465753425, + "grad_norm": 0.14856047928333282, + "kl": 0.004547119140625, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0002, + "reward": 0.14313821494579315, + "reward_std": 0.30480653047561646, + "rewards/cosine_scaled_reward": -0.16241733357310295, + "rewards/format_reward": 0.305555559694767, + "step": 63 + }, + { + "completion_length": 2631.2708740234375, + "epoch": 0.2191780821917808, + "grad_norm": 0.1544739007949829, + "kl": 0.0066070556640625, + "learning_rate": 9.65243099959949e-07, + "loss": 0.0003, + "reward": 0.3399986997246742, + "reward_std": 0.41406454145908356, + "rewards/cosine_scaled_reward": -0.0905568664893508, + "rewards/format_reward": 0.4305555522441864, + "step": 64 + }, + { + "completion_length": 2305.416748046875, + "epoch": 0.2226027397260274, + "grad_norm": 0.18752287328243256, + "kl": 0.011260986328125, + "learning_rate": 9.631972480961233e-07, + "loss": 0.0005, + "reward": 0.43726037442684174, + "reward_std": 0.25909677147865295, + "rewards/cosine_scaled_reward": -0.06273962743580341, + "rewards/format_reward": 0.5000000149011612, + "step": 65 + }, + { + "completion_length": 2983.9444580078125, + "epoch": 0.22602739726027396, + "grad_norm": 0.11286786943674088, + "kl": 0.00555419921875, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0002, + "reward": 0.2669789642095566, + "reward_std": 0.39550328254699707, + "rewards/cosine_scaled_reward": -0.12190994247794151, + "rewards/format_reward": 0.3888888955116272, + "step": 66 + }, + { + "completion_length": 2817.4652099609375, + "epoch": 0.22945205479452055, + "grad_norm": 0.12778045237064362, + "kl": 0.006805419921875, + "learning_rate": 9.589380080381038e-07, + "loss": 0.0003, 
+ "reward": 0.37828030437231064, + "reward_std": 0.372231587767601, + "rewards/cosine_scaled_reward": -0.05921970750205219, + "rewards/format_reward": 0.4375000149011612, + "step": 67 + }, + { + "completion_length": 2652.541748046875, + "epoch": 0.2328767123287671, + "grad_norm": 0.21021802723407745, + "kl": 0.009674072265625, + "learning_rate": 9.567251964768342e-07, + "loss": 0.0004, + "reward": 0.36411239206790924, + "reward_std": 0.4231058210134506, + "rewards/cosine_scaled_reward": -0.04560980945825577, + "rewards/format_reward": 0.4097222238779068, + "step": 68 + }, + { + "completion_length": 2930.09716796875, + "epoch": 0.2363013698630137, + "grad_norm": 0.13323834538459778, + "kl": 0.0063323974609375, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0003, + "reward": 0.2518671154975891, + "reward_std": 0.44373343884944916, + "rewards/cosine_scaled_reward": -0.10924399271607399, + "rewards/format_reward": 0.361111119389534, + "step": 69 + }, + { + "completion_length": 2892.1041259765625, + "epoch": 0.23972602739726026, + "grad_norm": 0.1291056126356125, + "kl": 0.0074005126953125, + "learning_rate": 9.521346881455354e-07, + "loss": 0.0003, + "reward": 0.32329247891902924, + "reward_std": 0.2955201715230942, + "rewards/cosine_scaled_reward": -0.04476307414006442, + "rewards/format_reward": 0.3680555522441864, + "step": 70 + }, + { + "completion_length": 3238.2847900390625, + "epoch": 0.24315068493150685, + "grad_norm": 0.12446684390306473, + "kl": 0.0060577392578125, + "learning_rate": 9.497576128568518e-07, + "loss": 0.0002, + "reward": 0.127724077552557, + "reward_std": 0.38546572625637054, + "rewards/cosine_scaled_reward": -0.129220362752676, + "rewards/format_reward": 0.256944440305233, + "step": 71 + }, + { + "completion_length": 2898.52783203125, + "epoch": 0.2465753424657534, + "grad_norm": 0.12787607312202454, + "kl": 0.007537841796875, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0003, + "reward": 0.21395007893443108, + "reward_std": 
0.3605284094810486, + "rewards/cosine_scaled_reward": -0.14716103300452232, + "rewards/format_reward": 0.361111119389534, + "step": 72 + }, + { + "completion_length": 2566.1666259765625, + "epoch": 0.25, + "grad_norm": 0.16552697122097015, + "kl": 0.010162353515625, + "learning_rate": 9.448414290795618e-07, + "loss": 0.0004, + "reward": 0.5014020800590515, + "reward_std": 0.3501627743244171, + "rewards/cosine_scaled_reward": 0.015290968120098114, + "rewards/format_reward": 0.486111119389534, + "step": 73 + }, + { + "completion_length": 2927.8125, + "epoch": 0.2534246575342466, + "grad_norm": 0.1325031816959381, + "kl": 0.007843017578125, + "learning_rate": 9.42302986163543e-07, + "loss": 0.0003, + "reward": 0.20612417813390493, + "reward_std": 0.38437697291374207, + "rewards/cosine_scaled_reward": -0.1758202537894249, + "rewards/format_reward": 0.3819444477558136, + "step": 74 + }, + { + "completion_length": 2712.4583740234375, + "epoch": 0.2568493150684932, + "grad_norm": 0.1481809765100479, + "kl": 0.0088043212890625, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0004, + "reward": 0.46940816938877106, + "reward_std": 0.4842826575040817, + "rewards/cosine_scaled_reward": -0.002814057283103466, + "rewards/format_reward": 0.472222238779068, + "step": 75 + }, + { + "completion_length": 2792.3055419921875, + "epoch": 0.2602739726027397, + "grad_norm": 0.22191229462623596, + "kl": 0.0104827880859375, + "learning_rate": 9.370671165529144e-07, + "loss": 0.0004, + "reward": 0.3087327107787132, + "reward_std": 0.3974086493253708, + "rewards/cosine_scaled_reward": -0.07321174256503582, + "rewards/format_reward": 0.3819444477558136, + "step": 76 + }, + { + "completion_length": 2848.0625, + "epoch": 0.2636986301369863, + "grad_norm": 0.14868301153182983, + "kl": 0.009918212890625, + "learning_rate": 9.343703987112365e-07, + "loss": 0.0004, + "reward": 0.361818864941597, + "reward_std": 0.47596603631973267, + "rewards/cosine_scaled_reward": -0.09651443734765053, + 
"rewards/format_reward": 0.4583333432674408, + "step": 77 + }, + { + "completion_length": 3004.96533203125, + "epoch": 0.2671232876712329, + "grad_norm": 0.1331622451543808, + "kl": 0.007659912109375, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0003, + "reward": 0.4262048453092575, + "reward_std": 0.38969628512859344, + "rewards/cosine_scaled_reward": 0.030371490865945816, + "rewards/format_reward": 0.3958333283662796, + "step": 78 + }, + { + "completion_length": 2514.673583984375, + "epoch": 0.2705479452054795, + "grad_norm": 0.15792632102966309, + "kl": 0.010833740234375, + "learning_rate": 9.288212223678658e-07, + "loss": 0.0004, + "reward": 0.517068013548851, + "reward_std": 0.4114367365837097, + "rewards/cosine_scaled_reward": -0.003765310626477003, + "rewards/format_reward": 0.5208333283662796, + "step": 79 + }, + { + "completion_length": 2929.9652099609375, + "epoch": 0.273972602739726, + "grad_norm": 0.16112865507602692, + "kl": 0.009185791015625, + "learning_rate": 9.259695151358214e-07, + "loss": 0.0004, + "reward": 0.22491584718227386, + "reward_std": 0.3951188027858734, + "rewards/cosine_scaled_reward": -0.11536196433007717, + "rewards/format_reward": 0.3402777910232544, + "step": 80 + }, + { + "completion_length": 2740.84716796875, + "epoch": 0.2773972602739726, + "grad_norm": 0.16842201352119446, + "kl": 0.012237548828125, + "learning_rate": 9.230669076497687e-07, + "loss": 0.0005, + "reward": 0.4160406142473221, + "reward_std": 0.47858820855617523, + "rewards/cosine_scaled_reward": -0.007570529356598854, + "rewards/format_reward": 0.423611119389534, + "step": 81 + }, + { + "completion_length": 2845.5069580078125, + "epoch": 0.2808219178082192, + "grad_norm": 0.20275162160396576, + "kl": 0.009857177734375, + "learning_rate": 9.20113792876298e-07, + "loss": 0.0004, + "reward": 0.3008575513958931, + "reward_std": 0.46689480543136597, + "rewards/cosine_scaled_reward": -0.088031355291605, + "rewards/format_reward": 0.3888888955116272, + "step": 
82 + }, + { + "completion_length": 2746.25, + "epoch": 0.2842465753424658, + "grad_norm": 0.186171293258667, + "kl": 0.013397216796875, + "learning_rate": 9.171105706198774e-07, + "loss": 0.0005, + "reward": 0.2251388169825077, + "reward_std": 0.49258220195770264, + "rewards/cosine_scaled_reward": -0.1568056344985962, + "rewards/format_reward": 0.3819444477558136, + "step": 83 + }, + { + "completion_length": 2873.1805419921875, + "epoch": 0.2876712328767123, + "grad_norm": 0.1654582917690277, + "kl": 0.00982666015625, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0004, + "reward": 0.33728019893169403, + "reward_std": 0.4015253335237503, + "rewards/cosine_scaled_reward": -0.05855315364897251, + "rewards/format_reward": 0.3958333432674408, + "step": 84 + }, + { + "completion_length": 2642.9376220703125, + "epoch": 0.2910958904109589, + "grad_norm": 0.1457146555185318, + "kl": 0.01275634765625, + "learning_rate": 9.109554367397697e-07, + "loss": 0.0005, + "reward": 0.5559843331575394, + "reward_std": 0.39349929988384247, + "rewards/cosine_scaled_reward": 0.0004287753254175186, + "rewards/format_reward": 0.5555555522441864, + "step": 85 + }, + { + "completion_length": 2919.9027099609375, + "epoch": 0.2945205479452055, + "grad_norm": 0.12657979130744934, + "kl": 0.0092926025390625, + "learning_rate": 9.078043584226815e-07, + "loss": 0.0004, + "reward": 0.4835028350353241, + "reward_std": 0.4722355157136917, + "rewards/cosine_scaled_reward": -0.009552719071507454, + "rewards/format_reward": 0.4930555671453476, + "step": 86 + }, + { + "completion_length": 2724.104248046875, + "epoch": 0.2979452054794521, + "grad_norm": 0.16625893115997314, + "kl": 0.0166015625, + "learning_rate": 9.046048391230247e-07, + "loss": 0.0007, + "reward": 0.23857301846146584, + "reward_std": 0.30345526337623596, + "rewards/cosine_scaled_reward": -0.09476033598184586, + "rewards/format_reward": 0.3333333283662796, + "step": 87 + }, + { + "completion_length": 3015.486083984375, + "epoch": 
0.3013698630136986, + "grad_norm": 0.15486519038677216, + "kl": 0.01129150390625, + "learning_rate": 9.013573120044966e-07, + "loss": 0.0005, + "reward": 0.197230139747262, + "reward_std": 0.4105876684188843, + "rewards/cosine_scaled_reward": -0.15693652629852295, + "rewards/format_reward": 0.3541666716337204, + "step": 88 + }, + { + "completion_length": 3045.7708740234375, + "epoch": 0.3047945205479452, + "grad_norm": 0.11665515601634979, + "kl": 0.00848388671875, + "learning_rate": 8.980622167302837e-07, + "loss": 0.0003, + "reward": 0.2519669234752655, + "reward_std": 0.37049752473831177, + "rewards/cosine_scaled_reward": -0.07442197389900684, + "rewards/format_reward": 0.3263888955116272, + "step": 89 + }, + { + "completion_length": 2415.263916015625, + "epoch": 0.3082191780821918, + "grad_norm": 0.14136534929275513, + "kl": 0.01422119140625, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0006, + "reward": 0.4783553332090378, + "reward_std": 0.37421566247940063, + "rewards/cosine_scaled_reward": -0.05636689253151417, + "rewards/format_reward": 0.5347222238779068, + "step": 90 + }, + { + "completion_length": 3079.4166259765625, + "epoch": 0.3116438356164384, + "grad_norm": 0.1431264877319336, + "kl": 0.011749267578125, + "learning_rate": 8.91331112506991e-07, + "loss": 0.0005, + "reward": 0.1510103940963745, + "reward_std": 0.4773586541414261, + "rewards/cosine_scaled_reward": -0.1684340313076973, + "rewards/format_reward": 0.3194444477558136, + "step": 91 + }, + { + "completion_length": 2879.6458740234375, + "epoch": 0.3150684931506849, + "grad_norm": 0.16839326918125153, + "kl": 0.013824462890625, + "learning_rate": 8.878960148416747e-07, + "loss": 0.0006, + "reward": 0.36638200283050537, + "reward_std": 0.4755648225545883, + "rewards/cosine_scaled_reward": -0.07111799996346235, + "rewards/format_reward": 0.4375, + "step": 92 + }, + { + "completion_length": 2882.326416015625, + "epoch": 0.3184931506849315, + "grad_norm": 0.14426778256893158, + "kl": 
0.01336669921875, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0005, + "reward": 0.41798925399780273, + "reward_std": 0.3813701719045639, + "rewards/cosine_scaled_reward": 0.015211460180580616, + "rewards/format_reward": 0.4027777910232544, + "step": 93 + }, + { + "completion_length": 2881.97216796875, + "epoch": 0.3219178082191781, + "grad_norm": 0.13846907019615173, + "kl": 0.012542724609375, + "learning_rate": 8.808890536269229e-07, + "loss": 0.0005, + "reward": 0.3407672867178917, + "reward_std": 0.5038859099149704, + "rewards/cosine_scaled_reward": -0.05506602302193642, + "rewards/format_reward": 0.3958333432674408, + "step": 94 + }, + { + "completion_length": 3092.4375, + "epoch": 0.3253424657534247, + "grad_norm": 0.1459702104330063, + "kl": 0.011962890625, + "learning_rate": 8.773181387078719e-07, + "loss": 0.0005, + "reward": 0.0870569609105587, + "reward_std": 0.36285020411014557, + "rewards/cosine_scaled_reward": -0.1837763711810112, + "rewards/format_reward": 0.2708333283662796, + "step": 95 + }, + { + "completion_length": 2850.486083984375, + "epoch": 0.3287671232876712, + "grad_norm": 0.16794843971729279, + "kl": 0.016632080078125, + "learning_rate": 8.737029101523929e-07, + "loss": 0.0007, + "reward": 0.2914813682436943, + "reward_std": 0.40039965510368347, + "rewards/cosine_scaled_reward": -0.12518527917563915, + "rewards/format_reward": 0.4166666716337204, + "step": 96 + }, + { + "completion_length": 2382.861083984375, + "epoch": 0.3321917808219178, + "grad_norm": 0.17770880460739136, + "kl": 0.01910400390625, + "learning_rate": 8.700438574045617e-07, + "loss": 0.0008, + "reward": 0.5590852797031403, + "reward_std": 0.3997005224227905, + "rewards/cosine_scaled_reward": -0.010359160602092743, + "rewards/format_reward": 0.5694444477558136, + "step": 97 + }, + { + "completion_length": 2623.6181640625, + "epoch": 0.3356164383561644, + "grad_norm": 0.14203575253486633, + "kl": 0.0145263671875, + "learning_rate": 8.663414758415478e-07, + "loss": 
0.0006, + "reward": 0.4303872585296631, + "reward_std": 0.41416965425014496, + "rewards/cosine_scaled_reward": -0.055723853409290314, + "rewards/format_reward": 0.486111119389534, + "step": 98 + }, + { + "completion_length": 2726.8056640625, + "epoch": 0.339041095890411, + "grad_norm": 0.14974938333034515, + "kl": 0.014617919921875, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0006, + "reward": 0.5150108188390732, + "reward_std": 0.42872530221939087, + "rewards/cosine_scaled_reward": 0.02889970690011978, + "rewards/format_reward": 0.486111119389534, + "step": 99 + }, + { + "completion_length": 2724.8472900390625, + "epoch": 0.3424657534246575, + "grad_norm": 0.15371288359165192, + "kl": 0.015380859375, + "learning_rate": 8.588087370409302e-07, + "loss": 0.0006, + "reward": 0.31959572434425354, + "reward_std": 0.4421093314886093, + "rewards/cosine_scaled_reward": -0.06234869919717312, + "rewards/format_reward": 0.3819444477558136, + "step": 100 + }, + { + "completion_length": 2685.0069580078125, + "epoch": 0.3458904109589041, + "grad_norm": 0.15011949837207794, + "kl": 0.017578125, + "learning_rate": 8.549793996155795e-07, + "loss": 0.0007, + "reward": 0.28529858589172363, + "reward_std": 0.3051328808069229, + "rewards/cosine_scaled_reward": -0.14525696635246277, + "rewards/format_reward": 0.4305555522441864, + "step": 101 + }, + { + "completion_length": 2744.3056640625, + "epoch": 0.3493150684931507, + "grad_norm": 0.16087760031223297, + "kl": 0.015289306640625, + "learning_rate": 8.511087728614862e-07, + "loss": 0.0006, + "reward": 0.4498697370290756, + "reward_std": 0.459157794713974, + "rewards/cosine_scaled_reward": 0.01931417128071189, + "rewards/format_reward": 0.4305555522441864, + "step": 102 + }, + { + "completion_length": 2870.5208740234375, + "epoch": 0.3527397260273973, + "grad_norm": 0.1468544751405716, + "kl": 0.01470947265625, + "learning_rate": 8.471973807995534e-07, + "loss": 0.0006, + "reward": 0.3780040740966797, + "reward_std": 
0.5862710475921631, + "rewards/cosine_scaled_reward": -0.08727369178086519, + "rewards/format_reward": 0.4652777761220932, + "step": 103 + }, + { + "completion_length": 2846.5069580078125, + "epoch": 0.3561643835616438, + "grad_norm": 0.1548270583152771, + "kl": 0.018310546875, + "learning_rate": 8.432457529696548e-07, + "loss": 0.0007, + "reward": 0.35155677795410156, + "reward_std": 0.4553772062063217, + "rewards/cosine_scaled_reward": -0.0720543134957552, + "rewards/format_reward": 0.423611119389534, + "step": 104 + }, + { + "completion_length": 2724.076416015625, + "epoch": 0.3595890410958904, + "grad_norm": 0.1372641921043396, + "kl": 0.01708984375, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0007, + "reward": 0.23305706679821014, + "reward_std": 0.36483894288539886, + "rewards/cosine_scaled_reward": -0.16277626901865005, + "rewards/format_reward": 0.3958333283662796, + "step": 105 + }, + { + "completion_length": 2911.5069580078125, + "epoch": 0.363013698630137, + "grad_norm": 0.173682302236557, + "kl": 0.0185546875, + "learning_rate": 8.352239353294194e-07, + "loss": 0.0007, + "reward": 0.3065890637226403, + "reward_std": 0.30855099856853485, + "rewards/cosine_scaled_reward": -0.03368869423866272, + "rewards/format_reward": 0.3402777761220932, + "step": 106 + }, + { + "completion_length": 3053.5555419921875, + "epoch": 0.3664383561643836, + "grad_norm": 0.1345294862985611, + "kl": 0.01544189453125, + "learning_rate": 8.31154831544782e-07, + "loss": 0.0006, + "reward": 0.28517407923936844, + "reward_std": 0.4504627585411072, + "rewards/cosine_scaled_reward": -0.11065925285220146, + "rewards/format_reward": 0.3958333283662796, + "step": 107 + }, + { + "completion_length": 3053.4583740234375, + "epoch": 0.3698630136986301, + "grad_norm": 0.12478786706924438, + "kl": 0.015869140625, + "learning_rate": 8.270476638965461e-07, + "loss": 0.0006, + "reward": 0.18328540516085923, + "reward_std": 0.31476570665836334, + "rewards/cosine_scaled_reward": 
-0.14310350455343723, + "rewards/format_reward": 0.3263888880610466, + "step": 108 + }, + { + "completion_length": 2813.4444580078125, + "epoch": 0.3732876712328767, + "grad_norm": 0.13302823901176453, + "kl": 0.013427734375, + "learning_rate": 8.229029884294662e-07, + "loss": 0.0005, + "reward": 0.4454272836446762, + "reward_std": 0.4187168627977371, + "rewards/cosine_scaled_reward": 0.0009828601032495499, + "rewards/format_reward": 0.4444444477558136, + "step": 109 + }, + { + "completion_length": 3167.1875, + "epoch": 0.3767123287671233, + "grad_norm": 0.13403479754924774, + "kl": 0.013946533203125, + "learning_rate": 8.187213662662538e-07, + "loss": 0.0006, + "reward": 0.12785961106419563, + "reward_std": 0.4516500234603882, + "rewards/cosine_scaled_reward": -0.19158484041690826, + "rewards/format_reward": 0.3194444477558136, + "step": 110 + }, + { + "completion_length": 3124.513916015625, + "epoch": 0.3801369863013699, + "grad_norm": 0.13928255438804626, + "kl": 0.0164794921875, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0007, + "reward": 0.1088075079023838, + "reward_std": 0.40094128996133804, + "rewards/cosine_scaled_reward": -0.1828591525554657, + "rewards/format_reward": 0.291666679084301, + "step": 111 + }, + { + "completion_length": 3192.8194580078125, + "epoch": 0.3835616438356164, + "grad_norm": 0.12524190545082092, + "kl": 0.015838623046875, + "learning_rate": 8.102495512755938e-07, + "loss": 0.0006, + "reward": 0.16609879583120346, + "reward_std": 0.4136479049921036, + "rewards/cosine_scaled_reward": -0.18806789070367813, + "rewards/format_reward": 0.3541666716337204, + "step": 112 + }, + { + "completion_length": 3000.625, + "epoch": 0.386986301369863, + "grad_norm": 0.15665502846240997, + "kl": 0.01593017578125, + "learning_rate": 8.059605053962833e-07, + "loss": 0.0006, + "reward": 0.34966103732585907, + "reward_std": 0.5462008714675903, + "rewards/cosine_scaled_reward": -0.025338975712656975, + "rewards/format_reward": 0.375, + "step": 
113 + }, + { + "completion_length": 3111.944580078125, + "epoch": 0.3904109589041096, + "grad_norm": 0.12206412851810455, + "kl": 0.0172119140625, + "learning_rate": 8.01636806561836e-07, + "loss": 0.0007, + "reward": 0.3253851607441902, + "reward_std": 0.46508027613162994, + "rewards/cosine_scaled_reward": -0.021837057545781136, + "rewards/format_reward": 0.3472222238779068, + "step": 114 + }, + { + "completion_length": 2817.8333740234375, + "epoch": 0.3938356164383562, + "grad_norm": 0.15154601633548737, + "kl": 0.01849365234375, + "learning_rate": 7.972790401318627e-07, + "loss": 0.0007, + "reward": 0.19523771665990353, + "reward_std": 0.2818225920200348, + "rewards/cosine_scaled_reward": -0.17281781509518623, + "rewards/format_reward": 0.368055559694767, + "step": 115 + }, + { + "completion_length": 3147.47216796875, + "epoch": 0.3972602739726027, + "grad_norm": 0.11532028764486313, + "kl": 0.013916015625, + "learning_rate": 7.928877960781808e-07, + "loss": 0.0006, + "reward": 0.21391746401786804, + "reward_std": 0.4905927777290344, + "rewards/cosine_scaled_reward": -0.0777492057532072, + "rewards/format_reward": 0.2916666716337204, + "step": 116 + }, + { + "completion_length": 2670.5625, + "epoch": 0.4006849315068493, + "grad_norm": 0.16773977875709534, + "kl": 0.0164794921875, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0007, + "reward": 0.4450993984937668, + "reward_std": 0.4119822680950165, + "rewards/cosine_scaled_reward": -0.08962283562868834, + "rewards/format_reward": 0.5347222238779068, + "step": 117 + }, + { + "completion_length": 3084.826416015625, + "epoch": 0.4041095890410959, + "grad_norm": 0.13086926937103271, + "kl": 0.0169677734375, + "learning_rate": 7.840072575681468e-07, + "loss": 0.0007, + "reward": 0.32114290446043015, + "reward_std": 0.3112673908472061, + "rewards/cosine_scaled_reward": -0.033023773692548275, + "rewards/format_reward": 0.3541666716337204, + "step": 118 + }, + { + "completion_length": 3206.5625, + "epoch": 
0.4075342465753425, + "grad_norm": 0.1529916226863861, + "kl": 0.02081298828125, + "learning_rate": 7.795191653945538e-07, + "loss": 0.0008, + "reward": 0.08010175824165344, + "reward_std": 0.2550045773386955, + "rewards/cosine_scaled_reward": -0.14906491339206696, + "rewards/format_reward": 0.2291666641831398, + "step": 119 + }, + { + "completion_length": 2953.4097900390625, + "epoch": 0.410958904109589, + "grad_norm": 0.1457367241382599, + "kl": 0.01806640625, + "learning_rate": 7.75e-07, + "loss": 0.0007, + "reward": 0.4063476175069809, + "reward_std": 0.4592422544956207, + "rewards/cosine_scaled_reward": -0.024207940325140953, + "rewards/format_reward": 0.4305555671453476, + "step": 120 + }, + { + "completion_length": 2701.7569580078125, + "epoch": 0.4143835616438356, + "grad_norm": 0.16445693373680115, + "kl": 0.0205078125, + "learning_rate": 7.704503732071391e-07, + "loss": 0.0008, + "reward": 0.5053235739469528, + "reward_std": 0.5655853599309921, + "rewards/cosine_scaled_reward": -0.015509757213294506, + "rewards/format_reward": 0.5208333432674408, + "step": 121 + }, + { + "completion_length": 3087.791748046875, + "epoch": 0.4178082191780822, + "grad_norm": 0.11365609616041183, + "kl": 0.015625, + "learning_rate": 7.658709009626109e-07, + "loss": 0.0006, + "reward": 0.6354653835296631, + "reward_std": 0.5989937484264374, + "rewards/cosine_scaled_reward": 0.16324318200349808, + "rewards/format_reward": 0.472222238779068, + "step": 122 + }, + { + "completion_length": 2501.3958740234375, + "epoch": 0.4212328767123288, + "grad_norm": 0.14529232680797577, + "kl": 0.0194091796875, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0008, + "reward": 0.4810824617743492, + "reward_std": 0.4910588413476944, + "rewards/cosine_scaled_reward": -0.06058423314243555, + "rewards/format_reward": 0.5416666567325592, + "step": 123 + }, + { + "completion_length": 3121.854248046875, + "epoch": 0.4246575342465753, + "grad_norm": 0.11533054709434509, + "kl": 0.0169677734375, + 
"learning_rate": 7.566249040241553e-07, + "loss": 0.0007, + "reward": 0.26003750413656235, + "reward_std": 0.44601309299468994, + "rewards/cosine_scaled_reward": -0.0732958409935236, + "rewards/format_reward": 0.3333333283662796, + "step": 124 + }, + { + "completion_length": 2981.65283203125, + "epoch": 0.4280821917808219, + "grad_norm": 0.14013110101222992, + "kl": 0.0194091796875, + "learning_rate": 7.51959631090208e-07, + "loss": 0.0008, + "reward": 0.17028066888451576, + "reward_std": 0.35746677219867706, + "rewards/cosine_scaled_reward": -0.16305266320705414, + "rewards/format_reward": 0.3333333358168602, + "step": 125 + }, + { + "completion_length": 3021.8055419921875, + "epoch": 0.4315068493150685, + "grad_norm": 0.12227875739336014, + "kl": 0.01800537109375, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0007, + "reward": 0.35344888269901276, + "reward_std": 0.3790488839149475, + "rewards/cosine_scaled_reward": -0.0007177963852882385, + "rewards/format_reward": 0.3541666716337204, + "step": 126 + }, + { + "completion_length": 3308.8055419921875, + "epoch": 0.4349315068493151, + "grad_norm": 0.13628166913986206, + "kl": 0.01708984375, + "learning_rate": 7.425476942237444e-07, + "loss": 0.0007, + "reward": 0.2540893331170082, + "reward_std": 0.47945114970207214, + "rewards/cosine_scaled_reward": -0.07229956053197384, + "rewards/format_reward": 0.326388880610466, + "step": 127 + }, + { + "completion_length": 2813.6666259765625, + "epoch": 0.4383561643835616, + "grad_norm": 0.15076382458209991, + "kl": 0.019775390625, + "learning_rate": 7.37802304516818e-07, + "loss": 0.0008, + "reward": 0.47455114126205444, + "reward_std": 0.4427139610052109, + "rewards/cosine_scaled_reward": -0.03933773934841156, + "rewards/format_reward": 0.5138888955116272, + "step": 128 + }, + { + "completion_length": 2989.52783203125, + "epoch": 0.4417808219178082, + "grad_norm": 0.14209668338298798, + "kl": 0.01824951171875, + "learning_rate": 7.330314893841101e-07, + "loss": 
0.0007, + "reward": 0.3041275106370449, + "reward_std": 0.4487529695034027, + "rewards/cosine_scaled_reward": -0.09865027293562889, + "rewards/format_reward": 0.4027777910232544, + "step": 129 + }, + { + "completion_length": 2724.0556640625, + "epoch": 0.4452054794520548, + "grad_norm": 0.13135673105716705, + "kl": 0.02044677734375, + "learning_rate": 7.282358947176205e-07, + "loss": 0.0008, + "reward": 0.4611601382493973, + "reward_std": 0.44737473130226135, + "rewards/cosine_scaled_reward": -0.059673219453543425, + "rewards/format_reward": 0.520833358168602, + "step": 130 + }, + { + "completion_length": 2713.166748046875, + "epoch": 0.4486301369863014, + "grad_norm": 0.14880216121673584, + "kl": 0.0208740234375, + "learning_rate": 7.234161697641017e-07, + "loss": 0.0008, + "reward": 0.5555524080991745, + "reward_std": 0.3877111077308655, + "rewards/cosine_scaled_reward": 0.05555241275578737, + "rewards/format_reward": 0.5000000149011612, + "step": 131 + }, + { + "completion_length": 3061.0625, + "epoch": 0.4520547945205479, + "grad_norm": 0.12646687030792236, + "kl": 0.02032470703125, + "learning_rate": 7.185729670371604e-07, + "loss": 0.0008, + "reward": 0.15745187550783157, + "reward_std": 0.23714770376682281, + "rewards/cosine_scaled_reward": -0.15504813194274902, + "rewards/format_reward": 0.3125, + "step": 132 + }, + { + "completion_length": 3232.3472900390625, + "epoch": 0.4554794520547945, + "grad_norm": 0.10323884338140488, + "kl": 0.0181884765625, + "learning_rate": 7.137069422289181e-07, + "loss": 0.0007, + "reward": 0.2314557433128357, + "reward_std": 0.39915989339351654, + "rewards/cosine_scaled_reward": -0.08104425063356757, + "rewards/format_reward": 0.3125000149011612, + "step": 133 + }, + { + "completion_length": 3193.8612060546875, + "epoch": 0.4589041095890411, + "grad_norm": 0.1457975059747696, + "kl": 0.02178955078125, + "learning_rate": 7.08818754121241e-07, + "loss": 0.0009, + "reward": 0.03552616201341152, + "reward_std": 
0.36521604657173157, + "rewards/cosine_scaled_reward": -0.2630849555134773, + "rewards/format_reward": 0.298611119389534, + "step": 134 + }, + { + "completion_length": 3036.7291259765625, + "epoch": 0.4623287671232877, + "grad_norm": 0.12449899315834045, + "kl": 0.01971435546875, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0008, + "reward": 0.6097220778465271, + "reward_std": 0.6045728027820587, + "rewards/cosine_scaled_reward": 0.10972205176949501, + "rewards/format_reward": 0.5000000149011612, + "step": 135 + }, + { + "completion_length": 2624.75, + "epoch": 0.4657534246575342, + "grad_norm": 0.15769356489181519, + "kl": 0.02471923828125, + "learning_rate": 6.989785380482312e-07, + "loss": 0.001, + "reward": 0.4681246876716614, + "reward_std": 0.3646779954433441, + "rewards/cosine_scaled_reward": -0.052708632312715054, + "rewards/format_reward": 0.520833358168602, + "step": 136 + }, + { + "completion_length": 2946.8333740234375, + "epoch": 0.4691780821917808, + "grad_norm": 0.13552969694137573, + "kl": 0.02203369140625, + "learning_rate": 6.940278422906372e-07, + "loss": 0.0009, + "reward": 0.4625835418701172, + "reward_std": 0.4559210389852524, + "rewards/cosine_scaled_reward": 0.004250235855579376, + "rewards/format_reward": 0.4583333283662796, + "step": 137 + }, + { + "completion_length": 3224.576416015625, + "epoch": 0.4726027397260274, + "grad_norm": 0.1137109324336052, + "kl": 0.01885986328125, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0008, + "reward": 0.27407026663422585, + "reward_std": 0.42793411016464233, + "rewards/cosine_scaled_reward": -0.03148530051112175, + "rewards/format_reward": 0.305555559694767, + "step": 138 + }, + { + "completion_length": 3156.84033203125, + "epoch": 0.476027397260274, + "grad_norm": 0.13348160684108734, + "kl": 0.01849365234375, + "learning_rate": 6.840686264673168e-07, + "loss": 0.0007, + "reward": 0.2564939334988594, + "reward_std": 0.4258010536432266, + "rewards/cosine_scaled_reward": 
-0.06295053288340569, + "rewards/format_reward": 0.3194444477558136, + "step": 139 + }, + { + "completion_length": 3052.3125, + "epoch": 0.4794520547945205, + "grad_norm": 0.1428801566362381, + "kl": 0.021240234375, + "learning_rate": 6.790614547199906e-07, + "loss": 0.0009, + "reward": 0.3512444347143173, + "reward_std": 0.404025673866272, + "rewards/cosine_scaled_reward": -0.016811135224997997, + "rewards/format_reward": 0.3680555671453476, + "step": 140 + }, + { + "completion_length": 2792.486083984375, + "epoch": 0.4828767123287671, + "grad_norm": 0.13267238438129425, + "kl": 0.02069091796875, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0008, + "reward": 0.6453568935394287, + "reward_std": 0.5250414609909058, + "rewards/cosine_scaled_reward": 0.11063468037173152, + "rewards/format_reward": 0.5347222238779068, + "step": 141 + }, + { + "completion_length": 2989.0972900390625, + "epoch": 0.4863013698630137, + "grad_norm": 0.12677060067653656, + "kl": 0.021484375, + "learning_rate": 6.68995372916741e-07, + "loss": 0.0009, + "reward": 0.4786522537469864, + "reward_std": 0.5605998337268829, + "rewards/cosine_scaled_reward": 0.013374458998441696, + "rewards/format_reward": 0.4652777761220932, + "step": 142 + }, + { + "completion_length": 3084.6180419921875, + "epoch": 0.4897260273972603, + "grad_norm": 0.12558940052986145, + "kl": 0.0198974609375, + "learning_rate": 6.639378256471608e-07, + "loss": 0.0008, + "reward": 0.3289758712053299, + "reward_std": 0.45005667209625244, + "rewards/cosine_scaled_reward": -0.018246358260512352, + "rewards/format_reward": 0.3472222238779068, + "step": 143 + }, + { + "completion_length": 3120.8541259765625, + "epoch": 0.4931506849315068, + "grad_norm": 0.13771818578243256, + "kl": 0.02197265625, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0009, + "reward": 0.1804770578892203, + "reward_std": 0.40222403407096863, + "rewards/cosine_scaled_reward": -0.11813406273722649, + "rewards/format_reward": 0.2986111119389534, + 
"step": 144 + }, + { + "completion_length": 3007.013916015625, + "epoch": 0.4965753424657534, + "grad_norm": 0.19792306423187256, + "kl": 0.031005859375, + "learning_rate": 6.537771418340981e-07, + "loss": 0.0012, + "reward": 0.19204921275377274, + "reward_std": 0.30794692039489746, + "rewards/cosine_scaled_reward": -0.09961747378110886, + "rewards/format_reward": 0.2916666567325592, + "step": 145 + }, + { + "completion_length": 3020.9306640625, + "epoch": 0.5, + "grad_norm": 0.13727493584156036, + "kl": 0.0220947265625, + "learning_rate": 6.486753808845564e-07, + "loss": 0.0009, + "reward": 0.16832835972309113, + "reward_std": 0.5160266309976578, + "rewards/cosine_scaled_reward": -0.15806053578853607, + "rewards/format_reward": 0.3263888955116272, + "step": 146 + }, + { + "completion_length": 2718.4097900390625, + "epoch": 0.5034246575342466, + "grad_norm": 0.11808720976114273, + "kl": 0.021484375, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0009, + "reward": 0.5802240371704102, + "reward_std": 0.44783809781074524, + "rewards/cosine_scaled_reward": 0.017724037170410156, + "rewards/format_reward": 0.5625000149011612, + "step": 147 + }, + { + "completion_length": 2859.6458740234375, + "epoch": 0.5068493150684932, + "grad_norm": 0.21733467280864716, + "kl": 0.03009033203125, + "learning_rate": 6.384324742897735e-07, + "loss": 0.0012, + "reward": 0.4155062139034271, + "reward_std": 0.4282771795988083, + "rewards/cosine_scaled_reward": 0.005784010514616966, + "rewards/format_reward": 0.4097222238779068, + "step": 148 + }, + { + "completion_length": 2950.701416015625, + "epoch": 0.5102739726027398, + "grad_norm": 0.10564743727445602, + "kl": 0.02130126953125, + "learning_rate": 6.332927153701215e-07, + "loss": 0.0009, + "reward": 0.6034330129623413, + "reward_std": 0.47583654522895813, + "rewards/cosine_scaled_reward": 0.1103774681687355, + "rewards/format_reward": 0.4930555671453476, + "step": 149 + }, + { + "completion_length": 3038.013916015625, + "epoch": 
0.5136986301369864, + "grad_norm": 0.12356794625520706, + "kl": 0.02337646484375, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0009, + "reward": 0.577631801366806, + "reward_std": 0.47365154325962067, + "rewards/cosine_scaled_reward": 0.13318736106157303, + "rewards/format_reward": 0.4444444477558136, + "step": 150 + }, + { + "completion_length": 3258.90966796875, + "epoch": 0.5171232876712328, + "grad_norm": 0.13418567180633545, + "kl": 0.0211181640625, + "learning_rate": 6.229800653975054e-07, + "loss": 0.0008, + "reward": 0.3129318729043007, + "reward_std": 0.5764759629964828, + "rewards/cosine_scaled_reward": -0.0551237054169178, + "rewards/format_reward": 0.3680555671453476, + "step": 151 + }, + { + "completion_length": 2671.361083984375, + "epoch": 0.5205479452054794, + "grad_norm": 0.1966993510723114, + "kl": 0.0272216796875, + "learning_rate": 6.178085705122674e-07, + "loss": 0.0011, + "reward": 0.5614952445030212, + "reward_std": 0.43348051607608795, + "rewards/cosine_scaled_reward": 0.040661935694515705, + "rewards/format_reward": 0.5208333432674408, + "step": 152 + }, + { + "completion_length": 3149.1666259765625, + "epoch": 0.523972602739726, + "grad_norm": 0.12545958161354065, + "kl": 0.0216064453125, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0009, + "reward": 0.15634628385305405, + "reward_std": 0.38860486447811127, + "rewards/cosine_scaled_reward": -0.17698704451322556, + "rewards/format_reward": 0.3333333432674408, + "step": 153 + }, + { + "completion_length": 3038.9930419921875, + "epoch": 0.5273972602739726, + "grad_norm": 0.15269039571285248, + "kl": 0.0208740234375, + "learning_rate": 6.074387415372676e-07, + "loss": 0.0008, + "reward": 0.38375401496887207, + "reward_std": 0.48632751405239105, + "rewards/cosine_scaled_reward": -0.019023781642317772, + "rewards/format_reward": 0.4027777761220932, + "step": 154 + }, + { + "completion_length": 3070.8680419921875, + "epoch": 0.5308219178082192, + "grad_norm": 
0.11029084771871567, + "kl": 0.02392578125, + "learning_rate": 6.022418113563535e-07, + "loss": 0.001, + "reward": 0.26838141679763794, + "reward_std": 0.3721802681684494, + "rewards/cosine_scaled_reward": -0.07189634721726179, + "rewards/format_reward": 0.3402777910232544, + "step": 155 + }, + { + "completion_length": 3024.270751953125, + "epoch": 0.5342465753424658, + "grad_norm": 0.1171901524066925, + "kl": 0.02239990234375, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0009, + "reward": 0.36915363371372223, + "reward_std": 0.40745308995246887, + "rewards/cosine_scaled_reward": 0.021931427530944347, + "rewards/format_reward": 0.3472222238779068, + "step": 156 + }, + { + "completion_length": 2957.673583984375, + "epoch": 0.5376712328767124, + "grad_norm": 0.14073632657527924, + "kl": 0.0245361328125, + "learning_rate": 5.918274374182266e-07, + "loss": 0.001, + "reward": 0.18879153579473495, + "reward_std": 0.31644490361213684, + "rewards/cosine_scaled_reward": -0.14454180747270584, + "rewards/format_reward": 0.3333333432674408, + "step": 157 + }, + { + "completion_length": 2745.4306640625, + "epoch": 0.541095890410959, + "grad_norm": 0.15367640554904938, + "kl": 0.02471923828125, + "learning_rate": 5.866114036005362e-07, + "loss": 0.001, + "reward": 0.647605836391449, + "reward_std": 0.45094963908195496, + "rewards/cosine_scaled_reward": 0.12677249684929848, + "rewards/format_reward": 0.5208333432674408, + "step": 158 + }, + { + "completion_length": 2848.479248046875, + "epoch": 0.5445205479452054, + "grad_norm": 0.14879484474658966, + "kl": 0.026123046875, + "learning_rate": 5.813904131848564e-07, + "loss": 0.001, + "reward": 0.31949036195874214, + "reward_std": 0.43667902052402496, + "rewards/cosine_scaled_reward": -0.10412076953798532, + "rewards/format_reward": 0.423611119389534, + "step": 159 + }, + { + "completion_length": 2885.5416259765625, + "epoch": 0.547945205479452, + "grad_norm": 0.12330173701047897, + "kl": 0.02471923828125, + 
"learning_rate": 5.761651730097142e-07, + "loss": 0.001, + "reward": 0.47639837861061096, + "reward_std": 0.5159202069044113, + "rewards/cosine_scaled_reward": -0.00276830792427063, + "rewards/format_reward": 0.4791666716337204, + "step": 160 + }, + { + "completion_length": 3260.416748046875, + "epoch": 0.5513698630136986, + "grad_norm": 0.1258607804775238, + "kl": 0.02301025390625, + "learning_rate": 5.709363904889861e-07, + "loss": 0.0009, + "reward": 0.3024430572986603, + "reward_std": 0.6272812485694885, + "rewards/cosine_scaled_reward": -0.01700139231979847, + "rewards/format_reward": 0.3194444477558136, + "step": 161 + }, + { + "completion_length": 3207.2083740234375, + "epoch": 0.5547945205479452, + "grad_norm": 0.12467379122972488, + "kl": 0.02398681640625, + "learning_rate": 5.657047735161255e-07, + "loss": 0.001, + "reward": 0.20557049103081226, + "reward_std": 0.3906491547822952, + "rewards/cosine_scaled_reward": -0.07220727764070034, + "rewards/format_reward": 0.2777777761220932, + "step": 162 + }, + { + "completion_length": 2876.319580078125, + "epoch": 0.5582191780821918, + "grad_norm": 0.13523170351982117, + "kl": 0.02081298828125, + "learning_rate": 5.604710303683253e-07, + "loss": 0.0008, + "reward": 0.5364240109920502, + "reward_std": 0.48902808129787445, + "rewards/cosine_scaled_reward": 0.06420179456472397, + "rewards/format_reward": 0.4722222238779068, + "step": 163 + }, + { + "completion_length": 2883.8958740234375, + "epoch": 0.5616438356164384, + "grad_norm": 0.17181335389614105, + "kl": 0.028564453125, + "learning_rate": 5.552358696106288e-07, + "loss": 0.0011, + "reward": 0.27160534262657166, + "reward_std": 0.360342800617218, + "rewards/cosine_scaled_reward": -0.10339467972517014, + "rewards/format_reward": 0.375, + "step": 164 + }, + { + "completion_length": 3007.28466796875, + "epoch": 0.565068493150685, + "grad_norm": 0.12402219325304031, + "kl": 0.02569580078125, + "learning_rate": 5.5e-07, + "loss": 0.001, + "reward": 
0.4214746206998825, + "reward_std": 0.4252837300300598, + "rewards/cosine_scaled_reward": 0.004807952791452408, + "rewards/format_reward": 0.4166666716337204, + "step": 165 + }, + { + "completion_length": 2990.2291259765625, + "epoch": 0.5684931506849316, + "grad_norm": 0.1333877593278885, + "kl": 0.02728271484375, + "learning_rate": 5.447641303893714e-07, + "loss": 0.0011, + "reward": 0.3566634953022003, + "reward_std": 0.49885208904743195, + "rewards/cosine_scaled_reward": -0.05305874161422253, + "rewards/format_reward": 0.4097222238779068, + "step": 166 + }, + { + "completion_length": 3205.513916015625, + "epoch": 0.571917808219178, + "grad_norm": 0.14048750698566437, + "kl": 0.02264404296875, + "learning_rate": 5.395289696316747e-07, + "loss": 0.0009, + "reward": 0.29908060282468796, + "reward_std": 0.5275179445743561, + "rewards/cosine_scaled_reward": -0.07591940555721521, + "rewards/format_reward": 0.3750000149011612, + "step": 167 + }, + { + "completion_length": 2930.826416015625, + "epoch": 0.5753424657534246, + "grad_norm": 0.13344134390354156, + "kl": 0.0302734375, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0012, + "reward": 0.41841088235378265, + "reward_std": 0.40377357602119446, + "rewards/cosine_scaled_reward": -0.05381134897470474, + "rewards/format_reward": 0.4722222238779068, + "step": 168 + }, + { + "completion_length": 2789.326416015625, + "epoch": 0.5787671232876712, + "grad_norm": 0.1511034071445465, + "kl": 0.02587890625, + "learning_rate": 5.29063609511014e-07, + "loss": 0.001, + "reward": 0.45637938380241394, + "reward_std": 0.4706819951534271, + "rewards/cosine_scaled_reward": -0.00889836996793747, + "rewards/format_reward": 0.4652777761220932, + "step": 169 + }, + { + "completion_length": 3059.9930419921875, + "epoch": 0.5821917808219178, + "grad_norm": 0.14896270632743835, + "kl": 0.02703857421875, + "learning_rate": 5.238348269902859e-07, + "loss": 0.0011, + "reward": 0.18200979381799698, + "reward_std": 0.4613404721021652, + 
"rewards/cosine_scaled_reward": -0.19299022108316422, + "rewards/format_reward": 0.375, + "step": 170 + }, + { + "completion_length": 3044.4930419921875, + "epoch": 0.5856164383561644, + "grad_norm": 0.12685967981815338, + "kl": 0.02545166015625, + "learning_rate": 5.186095868151436e-07, + "loss": 0.001, + "reward": 0.24055355973541737, + "reward_std": 0.4408658444881439, + "rewards/cosine_scaled_reward": -0.14139089360833168, + "rewards/format_reward": 0.3819444477558136, + "step": 171 + }, + { + "completion_length": 2786.451416015625, + "epoch": 0.589041095890411, + "grad_norm": 0.14915940165519714, + "kl": 0.02362060546875, + "learning_rate": 5.133885963994639e-07, + "loss": 0.0009, + "reward": 0.23196261376142502, + "reward_std": 0.30038829147815704, + "rewards/cosine_scaled_reward": -0.14303740486502647, + "rewards/format_reward": 0.375, + "step": 172 + }, + { + "completion_length": 3067.9306640625, + "epoch": 0.5924657534246576, + "grad_norm": 0.10505218058824539, + "kl": 0.0228271484375, + "learning_rate": 5.081725625817735e-07, + "loss": 0.0009, + "reward": 0.5079791992902756, + "reward_std": 0.5481714606285095, + "rewards/cosine_scaled_reward": 0.05659032240509987, + "rewards/format_reward": 0.451388880610466, + "step": 173 + }, + { + "completion_length": 2666.3541259765625, + "epoch": 0.5958904109589042, + "grad_norm": 0.135061576962471, + "kl": 0.02471923828125, + "learning_rate": 5.02962191529556e-07, + "loss": 0.001, + "reward": 0.4175218790769577, + "reward_std": 0.37634842097759247, + "rewards/cosine_scaled_reward": -0.1033114567399025, + "rewards/format_reward": 0.5208333283662796, + "step": 174 + }, + { + "completion_length": 3136.0833740234375, + "epoch": 0.5993150684931506, + "grad_norm": 0.11110376566648483, + "kl": 0.026123046875, + "learning_rate": 4.977581886436462e-07, + "loss": 0.001, + "reward": 0.4329180419445038, + "reward_std": 0.5168599039316177, + "rewards/cosine_scaled_reward": -0.025415293872356415, + "rewards/format_reward": 
0.4583333283662796, + "step": 175 + }, + { + "completion_length": 3090.9583740234375, + "epoch": 0.6027397260273972, + "grad_norm": 0.12515892088413239, + "kl": 0.02777099609375, + "learning_rate": 4.925612584627324e-07, + "loss": 0.0011, + "reward": 0.3180784285068512, + "reward_std": 0.4640498459339142, + "rewards/cosine_scaled_reward": -0.0985882543027401, + "rewards/format_reward": 0.4166666716337204, + "step": 176 + }, + { + "completion_length": 3099.416748046875, + "epoch": 0.6061643835616438, + "grad_norm": 0.10670111328363419, + "kl": 0.027587890625, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0011, + "reward": 0.37505099177360535, + "reward_std": 0.31386855244636536, + "rewards/cosine_scaled_reward": -0.013837885111570358, + "rewards/format_reward": 0.3888889029622078, + "step": 177 + }, + { + "completion_length": 2943.9306640625, + "epoch": 0.6095890410958904, + "grad_norm": 0.13467499613761902, + "kl": 0.0272216796875, + "learning_rate": 4.821914294877326e-07, + "loss": 0.0011, + "reward": 0.2806200385093689, + "reward_std": 0.48631517589092255, + "rewards/cosine_scaled_reward": -0.1152133010327816, + "rewards/format_reward": 0.3958333283662796, + "step": 178 + }, + { + "completion_length": 3079.7222900390625, + "epoch": 0.613013698630137, + "grad_norm": 0.1212821900844574, + "kl": 0.02349853515625, + "learning_rate": 4.770199346024947e-07, + "loss": 0.0009, + "reward": 0.34058963507413864, + "reward_std": 0.5203238129615784, + "rewards/cosine_scaled_reward": -0.03441034443676472, + "rewards/format_reward": 0.375, + "step": 179 + }, + { + "completion_length": 2798.0555419921875, + "epoch": 0.6164383561643836, + "grad_norm": 0.14347128570079803, + "kl": 0.02813720703125, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0011, + "reward": 0.33986398577690125, + "reward_std": 0.4621936082839966, + "rewards/cosine_scaled_reward": -0.12541379779577255, + "rewards/format_reward": 0.4652777761220932, + "step": 180 + }, + { + "completion_length": 
3015.9791259765625, + "epoch": 0.6198630136986302, + "grad_norm": 0.10863691568374634, + "kl": 0.0252685546875, + "learning_rate": 4.667072846298785e-07, + "loss": 0.001, + "reward": 0.32850363850593567, + "reward_std": 0.4628835916519165, + "rewards/cosine_scaled_reward": -0.10205190535634756, + "rewards/format_reward": 0.4305555671453476, + "step": 181 + }, + { + "completion_length": 3229.4375, + "epoch": 0.6232876712328768, + "grad_norm": 0.13130250573158264, + "kl": 0.02691650390625, + "learning_rate": 4.6156752571022637e-07, + "loss": 0.0011, + "reward": 0.1762874647974968, + "reward_std": 0.45904412865638733, + "rewards/cosine_scaled_reward": -0.1084347516298294, + "rewards/format_reward": 0.2847222238779068, + "step": 182 + }, + { + "completion_length": 3043.9722900390625, + "epoch": 0.6267123287671232, + "grad_norm": 0.13968031108379364, + "kl": 0.02679443359375, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0011, + "reward": 0.3731088787317276, + "reward_std": 0.5312269479036331, + "rewards/cosine_scaled_reward": -0.029668924515135586, + "rewards/format_reward": 0.4027777910232544, + "step": 183 + }, + { + "completion_length": 3144.5625, + "epoch": 0.6301369863013698, + "grad_norm": 0.15973497927188873, + "kl": 0.03131103515625, + "learning_rate": 4.513246191154434e-07, + "loss": 0.0013, + "reward": 0.2171999216079712, + "reward_std": 0.43705086410045624, + "rewards/cosine_scaled_reward": -0.12307787034660578, + "rewards/format_reward": 0.3402777910232544, + "step": 184 + }, + { + "completion_length": 2747.4652099609375, + "epoch": 0.6335616438356164, + "grad_norm": 0.13849014043807983, + "kl": 0.02557373046875, + "learning_rate": 4.4622285816590186e-07, + "loss": 0.001, + "reward": 0.5670523345470428, + "reward_std": 0.41843467950820923, + "rewards/cosine_scaled_reward": 0.07399673759937286, + "rewards/format_reward": 0.4930555671453476, + "step": 185 + }, + { + "completion_length": 2985.71533203125, + "epoch": 0.636986301369863, + "grad_norm": 
0.15879811346530914, + "kl": 0.0255126953125, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.001, + "reward": 0.4425032064318657, + "reward_std": 0.6330223977565765, + "rewards/cosine_scaled_reward": -0.036663462640717626, + "rewards/format_reward": 0.4791666716337204, + "step": 186 + }, + { + "completion_length": 2936.0416259765625, + "epoch": 0.6404109589041096, + "grad_norm": 0.12994582951068878, + "kl": 0.0272216796875, + "learning_rate": 4.360621743528392e-07, + "loss": 0.0011, + "reward": 0.34703654050827026, + "reward_std": 0.4127475470304489, + "rewards/cosine_scaled_reward": -0.11824123747646809, + "rewards/format_reward": 0.4652777761220932, + "step": 187 + }, + { + "completion_length": 2758.3126220703125, + "epoch": 0.6438356164383562, + "grad_norm": 0.17351487278938293, + "kl": 0.03253173828125, + "learning_rate": 4.3100462708325914e-07, + "loss": 0.0013, + "reward": 0.3991873562335968, + "reward_std": 0.5110540986061096, + "rewards/cosine_scaled_reward": -0.07303486485034227, + "rewards/format_reward": 0.4722222238779068, + "step": 188 + }, + { + "completion_length": 3067.34033203125, + "epoch": 0.6472602739726028, + "grad_norm": 0.17369891703128815, + "kl": 0.02911376953125, + "learning_rate": 4.2596318988235037e-07, + "loss": 0.0012, + "reward": 0.3477390259504318, + "reward_std": 0.5584754794836044, + "rewards/cosine_scaled_reward": -0.05503876507282257, + "rewards/format_reward": 0.4027777910232544, + "step": 189 + }, + { + "completion_length": 2643.451416015625, + "epoch": 0.6506849315068494, + "grad_norm": 0.15890643000602722, + "kl": 0.0362548828125, + "learning_rate": 4.209385452800095e-07, + "loss": 0.0015, + "reward": 0.3941483050584793, + "reward_std": 0.4240891933441162, + "rewards/cosine_scaled_reward": -0.05724058859050274, + "rewards/format_reward": 0.4513888955116272, + "step": 190 + }, + { + "completion_length": 3008.076416015625, + "epoch": 0.6541095890410958, + "grad_norm": 0.14325085282325745, + "kl": 0.02728271484375, + 
"learning_rate": 4.1593137353268303e-07, + "loss": 0.0011, + "reward": 0.4985590726137161, + "reward_std": 0.573657751083374, + "rewards/cosine_scaled_reward": 0.04717019200325012, + "rewards/format_reward": 0.4513888955116272, + "step": 191 + }, + { + "completion_length": 2820.451416015625, + "epoch": 0.6575342465753424, + "grad_norm": 0.14183150231838226, + "kl": 0.0311279296875, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0012, + "reward": 0.2823975309729576, + "reward_std": 0.4726349860429764, + "rewards/cosine_scaled_reward": -0.15510250255465508, + "rewards/format_reward": 0.4375, + "step": 192 + }, + { + "completion_length": 3112.888916015625, + "epoch": 0.660958904109589, + "grad_norm": 0.1434364914894104, + "kl": 0.0272216796875, + "learning_rate": 4.059721577093628e-07, + "loss": 0.0011, + "reward": 0.2338184304535389, + "reward_std": 0.513207420706749, + "rewards/cosine_scaled_reward": -0.0925704650580883, + "rewards/format_reward": 0.3263888955116272, + "step": 193 + }, + { + "completion_length": 2760.3958740234375, + "epoch": 0.6643835616438356, + "grad_norm": 0.15593542158603668, + "kl": 0.02716064453125, + "learning_rate": 4.0102146195176887e-07, + "loss": 0.0011, + "reward": 0.39227430522441864, + "reward_std": 0.431293249130249, + "rewards/cosine_scaled_reward": -0.05911456607282162, + "rewards/format_reward": 0.4513888955116272, + "step": 194 + }, + { + "completion_length": 3120.576416015625, + "epoch": 0.6678082191780822, + "grad_norm": 0.12067221850156784, + "kl": 0.02813720703125, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0011, + "reward": 0.33260975778102875, + "reward_std": 0.3859928399324417, + "rewards/cosine_scaled_reward": -0.014612471219152212, + "rewards/format_reward": 0.3472222238779068, + "step": 195 + }, + { + "completion_length": 2929.52783203125, + "epoch": 0.6712328767123288, + "grad_norm": 0.15069261193275452, + "kl": 0.0296630859375, + "learning_rate": 3.911812458787591e-07, + "loss": 0.0012, + "reward": 
0.3924526572227478, + "reward_std": 0.48095013201236725, + "rewards/cosine_scaled_reward": -0.051991806365549564, + "rewards/format_reward": 0.4444444477558136, + "step": 196 + }, + { + "completion_length": 2846.9375, + "epoch": 0.6746575342465754, + "grad_norm": 0.1341117024421692, + "kl": 0.0289306640625, + "learning_rate": 3.86293057771082e-07, + "loss": 0.0012, + "reward": 0.5837523490190506, + "reward_std": 0.5863839089870453, + "rewards/cosine_scaled_reward": 0.07680792175233364, + "rewards/format_reward": 0.5069444328546524, + "step": 197 + }, + { + "completion_length": 2936.75, + "epoch": 0.678082191780822, + "grad_norm": 0.12375517934560776, + "kl": 0.02490234375, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.001, + "reward": 0.421320840716362, + "reward_std": 0.48637837171554565, + "rewards/cosine_scaled_reward": 0.018543066456913948, + "rewards/format_reward": 0.4027777761220932, + "step": 198 + }, + { + "completion_length": 3141.076416015625, + "epoch": 0.6815068493150684, + "grad_norm": 0.14611783623695374, + "kl": 0.02886962890625, + "learning_rate": 3.7658383023589833e-07, + "loss": 0.0012, + "reward": 0.31543052941560745, + "reward_std": 0.484183669090271, + "rewards/cosine_scaled_reward": -0.059569500386714935, + "rewards/format_reward": 0.375, + "step": 199 + }, + { + "completion_length": 2385.8680419921875, + "epoch": 0.684931506849315, + "grad_norm": 0.1563843935728073, + "kl": 0.02777099609375, + "learning_rate": 3.7176410528237945e-07, + "loss": 0.0011, + "reward": 0.546364039182663, + "reward_std": 0.4009109437465668, + "rewards/cosine_scaled_reward": -0.023080429062247276, + "rewards/format_reward": 0.5694444626569748, + "step": 200 + }, + { + "completion_length": 3089.6458740234375, + "epoch": 0.6883561643835616, + "grad_norm": 0.12665298581123352, + "kl": 0.03118896484375, + "learning_rate": 3.6696851061588994e-07, + "loss": 0.0012, + "reward": 0.17153404070995748, + "reward_std": 0.39160603284835815, + 
"rewards/cosine_scaled_reward": -0.18263262510299683, + "rewards/format_reward": 0.354166679084301, + "step": 201 + }, + { + "completion_length": 2912.826416015625, + "epoch": 0.6917808219178082, + "grad_norm": 0.1533837616443634, + "kl": 0.0333251953125, + "learning_rate": 3.62197695483182e-07, + "loss": 0.0013, + "reward": 0.4708049148321152, + "reward_std": 0.5301374197006226, + "rewards/cosine_scaled_reward": -0.01530623622238636, + "rewards/format_reward": 0.486111119389534, + "step": 202 + }, + { + "completion_length": 2903.9652099609375, + "epoch": 0.6952054794520548, + "grad_norm": 0.1623891443014145, + "kl": 0.03265380859375, + "learning_rate": 3.5745230577625573e-07, + "loss": 0.0013, + "reward": 0.3767779842019081, + "reward_std": 0.37230053544044495, + "rewards/cosine_scaled_reward": -0.025999773293733597, + "rewards/format_reward": 0.4027777761220932, + "step": 203 + }, + { + "completion_length": 2757.076416015625, + "epoch": 0.6986301369863014, + "grad_norm": 0.9816704988479614, + "kl": 0.03271484375, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0013, + "reward": 0.44859715551137924, + "reward_std": 0.3788439631462097, + "rewards/cosine_scaled_reward": 0.004152711480855942, + "rewards/format_reward": 0.4444444626569748, + "step": 204 + }, + { + "completion_length": 3004.104248046875, + "epoch": 0.702054794520548, + "grad_norm": 0.14980177581310272, + "kl": 0.0347900390625, + "learning_rate": 3.4804036890979205e-07, + "loss": 0.0014, + "reward": 0.26160044223070145, + "reward_std": 0.4303555190563202, + "rewards/cosine_scaled_reward": -0.0647884514182806, + "rewards/format_reward": 0.3263888955116272, + "step": 205 + }, + { + "completion_length": 2953.40283203125, + "epoch": 0.7054794520547946, + "grad_norm": 0.13833436369895935, + "kl": 0.031982421875, + "learning_rate": 3.433750959758446e-07, + "loss": 0.0013, + "reward": 0.3072604089975357, + "reward_std": 0.5101044028997421, + "rewards/cosine_scaled_reward": -0.12329516559839249, + 
"rewards/format_reward": 0.4305555522441864, + "step": 206 + }, + { + "completion_length": 2859.96533203125, + "epoch": 0.708904109589041, + "grad_norm": 0.13901107013225555, + "kl": 0.03009033203125, + "learning_rate": 3.387377967463493e-07, + "loss": 0.0012, + "reward": 0.5190957188606262, + "reward_std": 0.4833519160747528, + "rewards/cosine_scaled_reward": 0.07465130463242531, + "rewards/format_reward": 0.4444444626569748, + "step": 207 + }, + { + "completion_length": 2990.9583740234375, + "epoch": 0.7123287671232876, + "grad_norm": 0.16973043978214264, + "kl": 0.0328369140625, + "learning_rate": 3.3412909903738936e-07, + "loss": 0.0013, + "reward": 0.1571333408355713, + "reward_std": 0.33002787828445435, + "rewards/cosine_scaled_reward": -0.13453333638608456, + "rewards/format_reward": 0.2916666716337204, + "step": 208 + }, + { + "completion_length": 2839.3126220703125, + "epoch": 0.7157534246575342, + "grad_norm": 0.13653664290905, + "kl": 0.03125, + "learning_rate": 3.295496267928609e-07, + "loss": 0.0013, + "reward": 0.3505253791809082, + "reward_std": 0.47255241870880127, + "rewards/cosine_scaled_reward": -0.1008635088801384, + "rewards/format_reward": 0.451388880610466, + "step": 209 + }, + { + "completion_length": 2634.1737060546875, + "epoch": 0.7191780821917808, + "grad_norm": 0.16086305677890778, + "kl": 0.02947998046875, + "learning_rate": 3.250000000000001e-07, + "loss": 0.0012, + "reward": 0.4823562502861023, + "reward_std": 0.4473782926797867, + "rewards/cosine_scaled_reward": -0.04542151384521276, + "rewards/format_reward": 0.5277777910232544, + "step": 210 + }, + { + "completion_length": 2892.3958740234375, + "epoch": 0.7226027397260274, + "grad_norm": 0.13830755650997162, + "kl": 0.03363037109375, + "learning_rate": 3.204808346054461e-07, + "loss": 0.0013, + "reward": 0.4033074826002121, + "reward_std": 0.41210463643074036, + "rewards/cosine_scaled_reward": -0.10363698564469814, + "rewards/format_reward": 0.5069444477558136, + "step": 211 + }, 
+ { + "completion_length": 2885.541748046875, + "epoch": 0.726027397260274, + "grad_norm": 0.1686064600944519, + "kl": 0.03106689453125, + "learning_rate": 3.159927424318531e-07, + "loss": 0.0012, + "reward": 0.3951975554227829, + "reward_std": 0.5239757895469666, + "rewards/cosine_scaled_reward": -0.04924688953906298, + "rewards/format_reward": 0.4444444477558136, + "step": 212 + }, + { + "completion_length": 2877.71533203125, + "epoch": 0.7294520547945206, + "grad_norm": 0.15673589706420898, + "kl": 0.034423828125, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0014, + "reward": 0.2623257301747799, + "reward_std": 0.3177921772003174, + "rewards/cosine_scaled_reward": -0.08489650301635265, + "rewards/format_reward": 0.3472222164273262, + "step": 213 + }, + { + "completion_length": 3026.298583984375, + "epoch": 0.7328767123287672, + "grad_norm": 0.1316094845533371, + "kl": 0.0333251953125, + "learning_rate": 3.0711220392181934e-07, + "loss": 0.0013, + "reward": 0.4284323900938034, + "reward_std": 0.5533152520656586, + "rewards/cosine_scaled_reward": -0.009067630395293236, + "rewards/format_reward": 0.4375, + "step": 214 + }, + { + "completion_length": 2914.27783203125, + "epoch": 0.7363013698630136, + "grad_norm": 0.1542029231786728, + "kl": 0.02874755859375, + "learning_rate": 3.027209598681373e-07, + "loss": 0.0011, + "reward": 0.3265261799097061, + "reward_std": 0.4032795578241348, + "rewards/cosine_scaled_reward": -0.06930714938789606, + "rewards/format_reward": 0.3958333432674408, + "step": 215 + }, + { + "completion_length": 2932.25, + "epoch": 0.7397260273972602, + "grad_norm": 0.142373189330101, + "kl": 0.03369140625, + "learning_rate": 2.9836319343816397e-07, + "loss": 0.0013, + "reward": 0.5414205491542816, + "reward_std": 0.5791518688201904, + "rewards/cosine_scaled_reward": 0.013642808422446251, + "rewards/format_reward": 0.5277777761220932, + "step": 216 + }, + { + "completion_length": 2438.7987060546875, + "epoch": 0.7431506849315068, + 
"grad_norm": 0.15347974002361298, + "kl": 0.03460693359375, + "learning_rate": 2.9403949460371677e-07, + "loss": 0.0014, + "reward": 0.6215761005878448, + "reward_std": 0.360453262925148, + "rewards/cosine_scaled_reward": 0.0312983263283968, + "rewards/format_reward": 0.5902777910232544, + "step": 217 + }, + { + "completion_length": 2892.2916259765625, + "epoch": 0.7465753424657534, + "grad_norm": 0.14735041558742523, + "kl": 0.0301513671875, + "learning_rate": 2.897504487244061e-07, + "loss": 0.0012, + "reward": 0.46117305755615234, + "reward_std": 0.4907253533601761, + "rewards/cosine_scaled_reward": 0.030617523938417435, + "rewards/format_reward": 0.4305555522441864, + "step": 218 + }, + { + "completion_length": 2625.59033203125, + "epoch": 0.75, + "grad_norm": 0.30190229415893555, + "kl": 0.0400390625, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0016, + "reward": 0.4576456546783447, + "reward_std": 0.3912748843431473, + "rewards/cosine_scaled_reward": -0.014576543122529984, + "rewards/format_reward": 0.472222238779068, + "step": 219 + }, + { + "completion_length": 2769.8333740234375, + "epoch": 0.7534246575342466, + "grad_norm": 0.13973061740398407, + "kl": 0.035400390625, + "learning_rate": 2.812786337337463e-07, + "loss": 0.0014, + "reward": 0.5715092867612839, + "reward_std": 0.45521561801433563, + "rewards/cosine_scaled_reward": 0.022898193448781967, + "rewards/format_reward": 0.5486111342906952, + "step": 220 + }, + { + "completion_length": 2313.9097900390625, + "epoch": 0.7568493150684932, + "grad_norm": 0.1722535640001297, + "kl": 0.0362548828125, + "learning_rate": 2.770970115705341e-07, + "loss": 0.0014, + "reward": 0.6356571912765503, + "reward_std": 0.31826692819595337, + "rewards/cosine_scaled_reward": -0.017120573669672012, + "rewards/format_reward": 0.6527777910232544, + "step": 221 + }, + { + "completion_length": 2617.8333740234375, + "epoch": 0.7602739726027398, + "grad_norm": 0.1793624311685562, + "kl": 0.0341796875, + 
"learning_rate": 2.729523361034538e-07, + "loss": 0.0014, + "reward": 0.5496674627065659, + "reward_std": 0.45733560621738434, + "rewards/cosine_scaled_reward": 0.01494525047019124, + "rewards/format_reward": 0.5347222089767456, + "step": 222 + }, + { + "completion_length": 2766.7291259765625, + "epoch": 0.7636986301369864, + "grad_norm": 0.1420270800590515, + "kl": 0.02960205078125, + "learning_rate": 2.68845168455218e-07, + "loss": 0.0012, + "reward": 0.6161434650421143, + "reward_std": 0.4993426203727722, + "rewards/cosine_scaled_reward": 0.07447678688913584, + "rewards/format_reward": 0.5416666865348816, + "step": 223 + }, + { + "completion_length": 2775.375, + "epoch": 0.7671232876712328, + "grad_norm": 0.15780124068260193, + "kl": 0.0301513671875, + "learning_rate": 2.6477606467058035e-07, + "loss": 0.0012, + "reward": 0.5157144665718079, + "reward_std": 0.3755457401275635, + "rewards/cosine_scaled_reward": 0.03654780611395836, + "rewards/format_reward": 0.4791666716337204, + "step": 224 + }, + { + "completion_length": 2975.041748046875, + "epoch": 0.7705479452054794, + "grad_norm": 0.13926586508750916, + "kl": 0.03179931640625, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0013, + "reward": 0.19908806309103966, + "reward_std": 0.42198337614536285, + "rewards/cosine_scaled_reward": -0.14813418313860893, + "rewards/format_reward": 0.3472222238779068, + "step": 225 + }, + { + "completion_length": 3097.52783203125, + "epoch": 0.773972602739726, + "grad_norm": 0.1462773233652115, + "kl": 0.028076171875, + "learning_rate": 2.567542470303452e-07, + "loss": 0.0011, + "reward": 0.4122817665338516, + "reward_std": 0.5014311969280243, + "rewards/cosine_scaled_reward": -0.0321626765653491, + "rewards/format_reward": 0.4444444477558136, + "step": 226 + }, + { + "completion_length": 2771.5556640625, + "epoch": 0.7773972602739726, + "grad_norm": 0.1374952346086502, + "kl": 0.0338134765625, + "learning_rate": 2.528026192004466e-07, + "loss": 0.0014, + "reward": 
0.4719505310058594, + "reward_std": 0.49139489233493805, + "rewards/cosine_scaled_reward": -0.014160582795739174, + "rewards/format_reward": 0.486111119389534, + "step": 227 + }, + { + "completion_length": 2911.5833740234375, + "epoch": 0.7808219178082192, + "grad_norm": 0.1661740243434906, + "kl": 0.0330810546875, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0013, + "reward": 0.3701959401369095, + "reward_std": 0.479979932308197, + "rewards/cosine_scaled_reward": -0.06730403914116323, + "rewards/format_reward": 0.4375, + "step": 228 + }, + { + "completion_length": 2854.826416015625, + "epoch": 0.7842465753424658, + "grad_norm": 0.1530180722475052, + "kl": 0.03662109375, + "learning_rate": 2.450206003844205e-07, + "loss": 0.0015, + "reward": 0.47989606857299805, + "reward_std": 0.40031544864177704, + "rewards/cosine_scaled_reward": -0.027048394083976746, + "rewards/format_reward": 0.5069444626569748, + "step": 229 + }, + { + "completion_length": 3026.8055419921875, + "epoch": 0.7876712328767124, + "grad_norm": 0.12480524182319641, + "kl": 0.0284423828125, + "learning_rate": 2.411912629590699e-07, + "loss": 0.0011, + "reward": 0.38817086815834045, + "reward_std": 0.41600513458251953, + "rewards/cosine_scaled_reward": -0.014606935903429985, + "rewards/format_reward": 0.4027777761220932, + "step": 230 + }, + { + "completion_length": 2905.3333740234375, + "epoch": 0.791095890410959, + "grad_norm": 0.14893729984760284, + "kl": 0.033203125, + "learning_rate": 2.374037332934512e-07, + "loss": 0.0013, + "reward": 0.4625111222267151, + "reward_std": 0.45194628834724426, + "rewards/cosine_scaled_reward": -0.023599994368851185, + "rewards/format_reward": 0.4861111342906952, + "step": 231 + }, + { + "completion_length": 2417.1319580078125, + "epoch": 0.7945205479452054, + "grad_norm": 0.1770123839378357, + "kl": 0.0350341796875, + "learning_rate": 2.336585241584522e-07, + "loss": 0.0014, + "reward": 0.7192187607288361, + "reward_std": 0.39792077243328094, + 
"rewards/cosine_scaled_reward": 0.059496549889445305, + "rewards/format_reward": 0.659722238779068, + "step": 232 + }, + { + "completion_length": 3020.0833740234375, + "epoch": 0.797945205479452, + "grad_norm": 0.1579238921403885, + "kl": 0.0362548828125, + "learning_rate": 2.299561425954383e-07, + "loss": 0.0015, + "reward": 0.3186444193124771, + "reward_std": 0.5212821513414383, + "rewards/cosine_scaled_reward": -0.11885556951165199, + "rewards/format_reward": 0.4375, + "step": 233 + }, + { + "completion_length": 2965.21533203125, + "epoch": 0.8013698630136986, + "grad_norm": 0.13462784886360168, + "kl": 0.03240966796875, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.0013, + "reward": 0.4752238541841507, + "reward_std": 0.40988166630268097, + "rewards/cosine_scaled_reward": 0.003001643344759941, + "rewards/format_reward": 0.4722222238779068, + "step": 234 + }, + { + "completion_length": 2902.916748046875, + "epoch": 0.8047945205479452, + "grad_norm": 0.13331294059753418, + "kl": 0.02911376953125, + "learning_rate": 2.2268186129212807e-07, + "loss": 0.0012, + "reward": 0.3500567376613617, + "reward_std": 0.4639824479818344, + "rewards/cosine_scaled_reward": -0.0735543726477772, + "rewards/format_reward": 0.4236111044883728, + "step": 235 + }, + { + "completion_length": 2946.5, + "epoch": 0.8082191780821918, + "grad_norm": 0.14252088963985443, + "kl": 0.03155517578125, + "learning_rate": 2.1911094637307714e-07, + "loss": 0.0013, + "reward": 0.3601393699645996, + "reward_std": 0.40262600779533386, + "rewards/cosine_scaled_reward": -0.05652729608118534, + "rewards/format_reward": 0.4166666716337204, + "step": 236 + }, + { + "completion_length": 3064.47216796875, + "epoch": 0.8116438356164384, + "grad_norm": 0.16358421742916107, + "kl": 0.03594970703125, + "learning_rate": 2.1558482853517253e-07, + "loss": 0.0014, + "reward": 0.3410459593869746, + "reward_std": 0.3948727697134018, + "rewards/cosine_scaled_reward": -0.020065151154994965, + 
"rewards/format_reward": 0.3611111119389534, + "step": 237 + }, + { + "completion_length": 2980.416748046875, + "epoch": 0.815068493150685, + "grad_norm": 0.36835718154907227, + "kl": 0.039306640625, + "learning_rate": 2.1210398515832536e-07, + "loss": 0.0016, + "reward": 0.2538940832018852, + "reward_std": 0.3796728700399399, + "rewards/cosine_scaled_reward": -0.10721703246235847, + "rewards/format_reward": 0.3611111119389534, + "step": 238 + }, + { + "completion_length": 2511.3333740234375, + "epoch": 0.8184931506849316, + "grad_norm": 0.15058235824108124, + "kl": 0.03302001953125, + "learning_rate": 2.08668887493009e-07, + "loss": 0.0013, + "reward": 0.42302054166793823, + "reward_std": 0.35738538205623627, + "rewards/cosine_scaled_reward": -0.11864613555371761, + "rewards/format_reward": 0.5416666865348816, + "step": 239 + }, + { + "completion_length": 2788.2430419921875, + "epoch": 0.821917808219178, + "grad_norm": 0.15094441175460815, + "kl": 0.03497314453125, + "learning_rate": 2.0528000059645995e-07, + "loss": 0.0014, + "reward": 0.4362386465072632, + "reward_std": 0.37140533328056335, + "rewards/cosine_scaled_reward": -0.04987248365068808, + "rewards/format_reward": 0.486111119389534, + "step": 240 + }, + { + "completion_length": 2689.46533203125, + "epoch": 0.8253424657534246, + "grad_norm": 0.15874545276165009, + "kl": 0.03240966796875, + "learning_rate": 2.0193778326971628e-07, + "loss": 0.0013, + "reward": 0.44388699531555176, + "reward_std": 0.3412891924381256, + "rewards/cosine_scaled_reward": -0.08389079011976719, + "rewards/format_reward": 0.5277777910232544, + "step": 241 + }, + { + "completion_length": 3123.8680419921875, + "epoch": 0.8287671232876712, + "grad_norm": 0.1301407366991043, + "kl": 0.0311279296875, + "learning_rate": 1.986426879955034e-07, + "loss": 0.0012, + "reward": 0.21631913632154465, + "reward_std": 0.39288755506277084, + "rewards/cosine_scaled_reward": -0.15173641964793205, + "rewards/format_reward": 0.3680555522441864, + 
"step": 242 + }, + { + "completion_length": 2796.3056640625, + "epoch": 0.8321917808219178, + "grad_norm": 0.1545010507106781, + "kl": 0.038818359375, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.0016, + "reward": 0.3822927325963974, + "reward_std": 0.5006706863641739, + "rewards/cosine_scaled_reward": -0.08298505656421185, + "rewards/format_reward": 0.4652777761220932, + "step": 243 + }, + { + "completion_length": 2943.77783203125, + "epoch": 0.8356164383561644, + "grad_norm": 0.16031137108802795, + "kl": 0.0350341796875, + "learning_rate": 1.9219564157731844e-07, + "loss": 0.0014, + "reward": 0.3758165240287781, + "reward_std": 0.3894062936306, + "rewards/cosine_scaled_reward": -0.0339057189412415, + "rewards/format_reward": 0.409722238779068, + "step": 244 + }, + { + "completion_length": 2864.8819580078125, + "epoch": 0.839041095890411, + "grad_norm": 0.14703238010406494, + "kl": 0.0367431640625, + "learning_rate": 1.8904456326023027e-07, + "loss": 0.0015, + "reward": 0.3677906394004822, + "reward_std": 0.2914382070302963, + "rewards/cosine_scaled_reward": -0.06276494171470404, + "rewards/format_reward": 0.4305555671453476, + "step": 245 + }, + { + "completion_length": 2885.84033203125, + "epoch": 0.8424657534246576, + "grad_norm": 0.13430039584636688, + "kl": 0.03369140625, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.0013, + "reward": 0.39504893124103546, + "reward_std": 0.447158083319664, + "rewards/cosine_scaled_reward": -0.07717327354475856, + "rewards/format_reward": 0.4722222238779068, + "step": 246 + }, + { + "completion_length": 2797.9583740234375, + "epoch": 0.8458904109589042, + "grad_norm": 0.15996281802654266, + "kl": 0.03466796875, + "learning_rate": 1.8288942938012267e-07, + "loss": 0.0014, + "reward": 0.4512728601694107, + "reward_std": 0.49667105078697205, + "rewards/cosine_scaled_reward": -0.03483825922012329, + "rewards/format_reward": 0.4861111044883728, + "step": 247 + }, + { + "completion_length": 2761.7362060546875, + 
"epoch": 0.8493150684931506, + "grad_norm": 0.16140304505825043, + "kl": 0.0394287109375, + "learning_rate": 1.7988620712370195e-07, + "loss": 0.0016, + "reward": 0.5905152261257172, + "reward_std": 0.4745737165212631, + "rewards/cosine_scaled_reward": 0.03495965828187764, + "rewards/format_reward": 0.5555555522441864, + "step": 248 + }, + { + "completion_length": 2945.3055419921875, + "epoch": 0.8527397260273972, + "grad_norm": 0.14055365324020386, + "kl": 0.0325927734375, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.0013, + "reward": 0.5377485156059265, + "reward_std": 0.5574042201042175, + "rewards/cosine_scaled_reward": 0.023859622422605753, + "rewards/format_reward": 0.5138888955116272, + "step": 249 + }, + { + "completion_length": 2663.6458740234375, + "epoch": 0.8561643835616438, + "grad_norm": 0.17629733681678772, + "kl": 0.03955078125, + "learning_rate": 1.7403048486417868e-07, + "loss": 0.0016, + "reward": 0.5798373818397522, + "reward_std": 0.5406672060489655, + "rewards/cosine_scaled_reward": 0.06594848074018955, + "rewards/format_reward": 0.5138888955116272, + "step": 250 + }, + { + "completion_length": 2895.1944580078125, + "epoch": 0.8595890410958904, + "grad_norm": 0.18740905821323395, + "kl": 0.03955078125, + "learning_rate": 1.711787776321341e-07, + "loss": 0.0016, + "reward": 0.3543848991394043, + "reward_std": 0.5366209447383881, + "rewards/cosine_scaled_reward": -0.09700398705899715, + "rewards/format_reward": 0.451388880610466, + "step": 251 + }, + { + "completion_length": 2812.7222900390625, + "epoch": 0.863013698630137, + "grad_norm": 0.13896240293979645, + "kl": 0.0306396484375, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.0012, + "reward": 0.49542590975761414, + "reward_std": 0.42443887889385223, + "rewards/cosine_scaled_reward": -0.018462970852851868, + "rewards/format_reward": 0.513888880610466, + "step": 252 + }, + { + "completion_length": 2959.5069580078125, + "epoch": 0.8664383561643836, + "grad_norm": 
0.1251368522644043, + "kl": 0.03564453125, + "learning_rate": 1.6562960128876353e-07, + "loss": 0.0014, + "reward": 0.2707902789115906, + "reward_std": 0.5160115361213684, + "rewards/cosine_scaled_reward": -0.14587640017271042, + "rewards/format_reward": 0.4166666567325592, + "step": 253 + }, + { + "completion_length": 2675.423583984375, + "epoch": 0.8698630136986302, + "grad_norm": 0.15856920182704926, + "kl": 0.0364990234375, + "learning_rate": 1.6293288344708566e-07, + "loss": 0.0015, + "reward": 0.6221481561660767, + "reward_std": 0.3958921879529953, + "rewards/cosine_scaled_reward": 0.03881483152508736, + "rewards/format_reward": 0.5833333432674408, + "step": 254 + }, + { + "completion_length": 3005.2362060546875, + "epoch": 0.8732876712328768, + "grad_norm": 0.1423628181219101, + "kl": 0.0340576171875, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.0014, + "reward": 0.20419325679540634, + "reward_std": 0.3235137313604355, + "rewards/cosine_scaled_reward": -0.17080675438046455, + "rewards/format_reward": 0.375, + "step": 255 + }, + { + "completion_length": 2853.2222900390625, + "epoch": 0.8767123287671232, + "grad_norm": 0.18754243850708008, + "kl": 0.0416259765625, + "learning_rate": 1.5769701383645698e-07, + "loss": 0.0017, + "reward": 0.5054954886436462, + "reward_std": 0.41478313505649567, + "rewards/cosine_scaled_reward": 0.047162143513560295, + "rewards/format_reward": 0.4583333283662796, + "step": 256 + }, + { + "completion_length": 3062.604248046875, + "epoch": 0.8801369863013698, + "grad_norm": 0.14577165246009827, + "kl": 0.035888671875, + "learning_rate": 1.551585709204381e-07, + "loss": 0.0014, + "reward": 0.5648697018623352, + "reward_std": 0.6114741563796997, + "rewards/cosine_scaled_reward": 0.0787586160004139, + "rewards/format_reward": 0.4861111044883728, + "step": 257 + }, + { + "completion_length": 2579.1944580078125, + "epoch": 0.8835616438356164, + "grad_norm": 0.14230208098888397, + "kl": 0.03363037109375, + "learning_rate": 
1.5267358321348285e-07, + "loss": 0.0013, + "reward": 0.5974612534046173, + "reward_std": 0.35620053112506866, + "rewards/cosine_scaled_reward": 0.0557946152985096, + "rewards/format_reward": 0.5416666865348816, + "step": 258 + }, + { + "completion_length": 2702.3194580078125, + "epoch": 0.886986301369863, + "grad_norm": 0.1390962451696396, + "kl": 0.033935546875, + "learning_rate": 1.5024238714314825e-07, + "loss": 0.0014, + "reward": 0.6885968148708344, + "reward_std": 0.4262382835149765, + "rewards/cosine_scaled_reward": 0.09137461334466934, + "rewards/format_reward": 0.597222238779068, + "step": 259 + }, + { + "completion_length": 2968.2362060546875, + "epoch": 0.8904109589041096, + "grad_norm": 0.1457592248916626, + "kl": 0.03399658203125, + "learning_rate": 1.4786531185446452e-07, + "loss": 0.0014, + "reward": 0.33401037007570267, + "reward_std": 0.33265479654073715, + "rewards/cosine_scaled_reward": -0.00626740138977766, + "rewards/format_reward": 0.3402777761220932, + "step": 260 + }, + { + "completion_length": 3024.8680419921875, + "epoch": 0.8938356164383562, + "grad_norm": 0.14392928779125214, + "kl": 0.0400390625, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.0016, + "reward": 0.30607690662145615, + "reward_std": 0.3898402601480484, + "rewards/cosine_scaled_reward": -0.027256430126726627, + "rewards/format_reward": 0.3333333432674408, + "step": 261 + }, + { + "completion_length": 2864.013916015625, + "epoch": 0.8972602739726028, + "grad_norm": 0.12024541944265366, + "kl": 0.0279541015625, + "learning_rate": 1.432748035231658e-07, + "loss": 0.0011, + "reward": 0.5254741907119751, + "reward_std": 0.46267665922641754, + "rewards/cosine_scaled_reward": -0.0231369249522686, + "rewards/format_reward": 0.548611119389534, + "step": 262 + }, + { + "completion_length": 2763.9583740234375, + "epoch": 0.9006849315068494, + "grad_norm": 0.24614769220352173, + "kl": 0.040771484375, + "learning_rate": 1.4106199196189608e-07, + "loss": 0.0016, + "reward": 
0.35452011227607727, + "reward_std": 0.47811339795589447, + "rewards/cosine_scaled_reward": -0.09686877019703388, + "rewards/format_reward": 0.4513889104127884, + "step": 263 + }, + { + "completion_length": 2571.0, + "epoch": 0.9041095890410958, + "grad_norm": 0.14297829568386078, + "kl": 0.03631591796875, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.0015, + "reward": 0.49256862699985504, + "reward_std": 0.3805427849292755, + "rewards/cosine_scaled_reward": -0.0074313730001449585, + "rewards/format_reward": 0.5, + "step": 264 + }, + { + "completion_length": 2234.326416015625, + "epoch": 0.9075342465753424, + "grad_norm": 0.15764550864696503, + "kl": 0.032470703125, + "learning_rate": 1.3680275190387675e-07, + "loss": 0.0013, + "reward": 0.7705516219139099, + "reward_std": 0.4592142701148987, + "rewards/cosine_scaled_reward": 0.05527384765446186, + "rewards/format_reward": 0.7152777910232544, + "step": 265 + }, + { + "completion_length": 2849.0347900390625, + "epoch": 0.910958904109589, + "grad_norm": 0.13773973286151886, + "kl": 0.03289794921875, + "learning_rate": 1.3475690004005097e-07, + "loss": 0.0013, + "reward": 0.5077018439769745, + "reward_std": 0.3785110265016556, + "rewards/cosine_scaled_reward": -0.013131474610418081, + "rewards/format_reward": 0.5208333432674408, + "step": 266 + }, + { + "completion_length": 3086.2431640625, + "epoch": 0.9143835616438356, + "grad_norm": 0.1191553846001625, + "kl": 0.0340576171875, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0014, + "reward": 0.38693176954984665, + "reward_std": 0.45597271621227264, + "rewards/cosine_scaled_reward": -0.1269571604207158, + "rewards/format_reward": 0.513888880610466, + "step": 267 + }, + { + "completion_length": 2707.8681640625, + "epoch": 0.9178082191780822, + "grad_norm": 0.1543823927640915, + "kl": 0.0347900390625, + "learning_rate": 1.308341174832359e-07, + "loss": 0.0014, + "reward": 0.6485514044761658, + "reward_std": 0.4456641525030136, + 
"rewards/cosine_scaled_reward": 0.03744027949869633, + "rewards/format_reward": 0.6111111044883728, + "step": 268 + }, + { + "completion_length": 2430.388916015625, + "epoch": 0.9212328767123288, + "grad_norm": 0.20193035900592804, + "kl": 0.040283203125, + "learning_rate": 1.2895771787221088e-07, + "loss": 0.0016, + "reward": 0.5673209726810455, + "reward_std": 0.47004686295986176, + "rewards/cosine_scaled_reward": -0.009067919105291367, + "rewards/format_reward": 0.5763888955116272, + "step": 269 + }, + { + "completion_length": 2542.5208740234375, + "epoch": 0.9246575342465754, + "grad_norm": 0.14499835669994354, + "kl": 0.036376953125, + "learning_rate": 1.2713832064634125e-07, + "loss": 0.0015, + "reward": 0.6222769916057587, + "reward_std": 0.46079638600349426, + "rewards/cosine_scaled_reward": 0.05283256620168686, + "rewards/format_reward": 0.5694444477558136, + "step": 270 + }, + { + "completion_length": 2701.8056640625, + "epoch": 0.928082191780822, + "grad_norm": 0.16896043717861176, + "kl": 0.0341796875, + "learning_rate": 1.2537617212288742e-07, + "loss": 0.0014, + "reward": 0.36604734510183334, + "reward_std": 0.361560583114624, + "rewards/cosine_scaled_reward": -0.0922860149294138, + "rewards/format_reward": 0.4583333432674408, + "step": 271 + }, + { + "completion_length": 2764.4583740234375, + "epoch": 0.9315068493150684, + "grad_norm": 0.16842736303806305, + "kl": 0.0345458984375, + "learning_rate": 1.2367151086855187e-07, + "loss": 0.0014, + "reward": 0.5705253630876541, + "reward_std": 0.5290426909923553, + "rewards/cosine_scaled_reward": 0.07746978849172592, + "rewards/format_reward": 0.4930555522441864, + "step": 272 + }, + { + "completion_length": 2728.6597900390625, + "epoch": 0.934931506849315, + "grad_norm": 0.1411057859659195, + "kl": 0.0361328125, + "learning_rate": 1.220245676671809e-07, + "loss": 0.0014, + "reward": 0.5213780552148819, + "reward_std": 0.5097576379776001, + "rewards/cosine_scaled_reward": -0.02723303623497486, + 
"rewards/format_reward": 0.5486111044883728, + "step": 273 + }, + { + "completion_length": 3240.3055419921875, + "epoch": 0.9383561643835616, + "grad_norm": 0.1661859005689621, + "kl": 0.03228759765625, + "learning_rate": 1.2043556548852063e-07, + "loss": 0.0013, + "reward": 0.25158608704805374, + "reward_std": 0.4693699926137924, + "rewards/cosine_scaled_reward": -0.09563614055514336, + "rewards/format_reward": 0.3472222238779068, + "step": 274 + }, + { + "completion_length": 2608.8472900390625, + "epoch": 0.9417808219178082, + "grad_norm": 0.14709579944610596, + "kl": 0.0390625, + "learning_rate": 1.1890471945802999e-07, + "loss": 0.0016, + "reward": 0.5507668703794479, + "reward_std": 0.32081814110279083, + "rewards/cosine_scaled_reward": 0.0021557584404945374, + "rewards/format_reward": 0.548611119389534, + "step": 275 + }, + { + "completion_length": 2953.8055419921875, + "epoch": 0.9452054794520548, + "grad_norm": 0.13219843804836273, + "kl": 0.035888671875, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.0014, + "reward": 0.4363091439008713, + "reward_std": 0.5200669467449188, + "rewards/cosine_scaled_reward": -0.0428575212135911, + "rewards/format_reward": 0.4791666865348816, + "step": 276 + }, + { + "completion_length": 2877.375, + "epoch": 0.9486301369863014, + "grad_norm": 0.1316324770450592, + "kl": 0.038818359375, + "learning_rate": 1.160183169482775e-07, + "loss": 0.0016, + "reward": 0.5365364849567413, + "reward_std": 0.5149263441562653, + "rewards/cosine_scaled_reward": 0.05042533949017525, + "rewards/format_reward": 0.4861111342906952, + "step": 277 + }, + { + "completion_length": 2528.6944580078125, + "epoch": 0.952054794520548, + "grad_norm": 0.21519577503204346, + "kl": 0.03662109375, + "learning_rate": 1.1466315124171128e-07, + "loss": 0.0015, + "reward": 0.4645202308893204, + "reward_std": 0.4477800279855728, + "rewards/cosine_scaled_reward": -0.07714640907943249, + "rewards/format_reward": 0.5416666716337204, + "step": 278 + }, + { + 
"completion_length": 2701.9862060546875, + "epoch": 0.9554794520547946, + "grad_norm": 0.17575909197330475, + "kl": 0.0399169921875, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.0016, + "reward": 0.40750324726104736, + "reward_std": 0.4330911487340927, + "rewards/cosine_scaled_reward": -0.07166343554854393, + "rewards/format_reward": 0.4791666716337204, + "step": 279 + }, + { + "completion_length": 2913.5, + "epoch": 0.958904109589041, + "grad_norm": 0.1264602094888687, + "kl": 0.03594970703125, + "learning_rate": 1.1212980823907929e-07, + "loss": 0.0014, + "reward": 0.5513550490140915, + "reward_std": 0.5005469620227814, + "rewards/cosine_scaled_reward": 0.04441063478589058, + "rewards/format_reward": 0.5069444626569748, + "step": 280 + }, + { + "completion_length": 2647.451416015625, + "epoch": 0.9623287671232876, + "grad_norm": 0.2159995436668396, + "kl": 0.0328369140625, + "learning_rate": 1.1095197391710362e-07, + "loss": 0.0013, + "reward": 0.5046227127313614, + "reward_std": 0.4968739449977875, + "rewards/cosine_scaled_reward": -0.01621063333004713, + "rewards/format_reward": 0.5208333283662796, + "step": 281 + }, + { + "completion_length": 2636.2708740234375, + "epoch": 0.9657534246575342, + "grad_norm": 0.16840696334838867, + "kl": 0.0361328125, + "learning_rate": 1.0983357966978745e-07, + "loss": 0.0014, + "reward": 0.4477865546941757, + "reward_std": 0.47679105401039124, + "rewards/cosine_scaled_reward": -0.0730467566754669, + "rewards/format_reward": 0.5208333283662796, + "step": 282 + }, + { + "completion_length": 2957.4930419921875, + "epoch": 0.9691780821917808, + "grad_norm": 0.14714959263801575, + "kl": 0.03460693359375, + "learning_rate": 1.0877477690980931e-07, + "loss": 0.0014, + "reward": 0.35540203750133514, + "reward_std": 0.47071878612041473, + "rewards/cosine_scaled_reward": -0.09598685055971146, + "rewards/format_reward": 0.4513888955116272, + "step": 283 + }, + { + "completion_length": 2786.5208740234375, + "epoch": 
0.9726027397260274, + "grad_norm": 0.1734078973531723, + "kl": 0.044677734375, + "learning_rate": 1.0777570898211405e-07, + "loss": 0.0018, + "reward": 0.3655647486448288, + "reward_std": 0.4497402310371399, + "rewards/cosine_scaled_reward": -0.05110191088169813, + "rewards/format_reward": 0.4166666716337204, + "step": 284 + }, + { + "completion_length": 2854.5069580078125, + "epoch": 0.976027397260274, + "grad_norm": 0.140737384557724, + "kl": 0.0367431640625, + "learning_rate": 1.068365111445064e-07, + "loss": 0.0015, + "reward": 0.3777775317430496, + "reward_std": 0.4247249662876129, + "rewards/cosine_scaled_reward": -0.09444468468427658, + "rewards/format_reward": 0.4722222238779068, + "step": 285 + }, + { + "completion_length": 2879.7501220703125, + "epoch": 0.9794520547945206, + "grad_norm": 0.15024779736995697, + "kl": 0.0333251953125, + "learning_rate": 1.0595731054933934e-07, + "loss": 0.0013, + "reward": 0.24627424031496048, + "reward_std": 0.3542858809232712, + "rewards/cosine_scaled_reward": -0.12178133055567741, + "rewards/format_reward": 0.3680555671453476, + "step": 286 + }, + { + "completion_length": 2901.6944580078125, + "epoch": 0.9828767123287672, + "grad_norm": 0.12313710153102875, + "kl": 0.03375244140625, + "learning_rate": 1.0513822622629978e-07, + "loss": 0.0013, + "reward": 0.5626899600028992, + "reward_std": 0.47839629650115967, + "rewards/cosine_scaled_reward": 0.06268996931612492, + "rewards/format_reward": 0.5000000149011612, + "step": 287 + }, + { + "completion_length": 2720.4097900390625, + "epoch": 0.9863013698630136, + "grad_norm": 0.13372260332107544, + "kl": 0.03326416015625, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0013, + "reward": 0.42101193219423294, + "reward_std": 0.44517213106155396, + "rewards/cosine_scaled_reward": -0.07898806827142835, + "rewards/format_reward": 0.5, + "step": 288 + }, + { + "completion_length": 2722.4305419921875, + "epoch": 0.9897260273972602, + "grad_norm": 0.18578775227069855, + "kl": 
0.03375244140625, + "learning_rate": 1.0368084180643224e-07, + "loss": 0.0013, + "reward": 0.574064090847969, + "reward_std": 0.4914132207632065, + "rewards/cosine_scaled_reward": 0.018508493900299072, + "rewards/format_reward": 0.5555555522441864, + "step": 289 + }, + { + "completion_length": 2860.0208740234375, + "epoch": 0.9931506849315068, + "grad_norm": 0.16790378093719482, + "kl": 0.037841796875, + "learning_rate": 1.0304273901612565e-07, + "loss": 0.0015, + "reward": 0.4475916475057602, + "reward_std": 0.35639651119709015, + "rewards/cosine_scaled_reward": -0.05240832082927227, + "rewards/format_reward": 0.5000000149011612, + "step": 290 + }, + { + "completion_length": 2500.354248046875, + "epoch": 0.9965753424657534, + "grad_norm": 0.19861853122711182, + "kl": 0.0384521484375, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.0015, + "reward": 0.5283551514148712, + "reward_std": 0.4327033758163452, + "rewards/cosine_scaled_reward": -0.03414486348628998, + "rewards/format_reward": 0.5625, + "step": 291 + }, + { + "completion_length": 3228.75, + "epoch": 1.0, + "grad_norm": 0.14693370461463928, + "kl": 0.03466796875, + "learning_rate": 1.0194814420758804e-07, + "loss": 0.0014, + "reward": 0.8688084781169891, + "reward_std": 0.7189086824655533, + "rewards/cosine_scaled_reward": -0.00619150698184967, + "rewards/format_reward": 0.875, + "step": 292 + }, + { + "completion_length": 2950.875, + "epoch": 1.0034246575342465, + "grad_norm": 0.1779322773218155, + "kl": 0.0443115234375, + "learning_rate": 1.0149180037997228e-07, + "loss": 0.0018, + "reward": 0.3629312068223953, + "reward_std": 0.47756847739219666, + "rewards/cosine_scaled_reward": -0.06067990604788065, + "rewards/format_reward": 0.423611119389534, + "step": 293 + }, + { + "completion_length": 2664.9375, + "epoch": 1.0068493150684932, + "grad_norm": 0.140543133020401, + "kl": 0.03173828125, + "learning_rate": 1.0109617738307911e-07, + "loss": 0.0013, + "reward": 0.42808833718299866, + "reward_std": 
0.38089750707149506, + "rewards/cosine_scaled_reward": -0.08580057881772518, + "rewards/format_reward": 0.513888880610466, + "step": 294 + }, + { + "completion_length": 2873.826416015625, + "epoch": 1.0102739726027397, + "grad_norm": 0.14133252203464508, + "kl": 0.032958984375, + "learning_rate": 1.0076132877792932e-07, + "loss": 0.0013, + "reward": 0.463130921125412, + "reward_std": 0.6191278696060181, + "rewards/cosine_scaled_reward": -0.03686907887458801, + "rewards/format_reward": 0.5, + "step": 295 + }, + { + "completion_length": 3035.78466796875, + "epoch": 1.0136986301369864, + "grad_norm": 0.12591783702373505, + "kl": 0.03314208984375, + "learning_rate": 1.0048729989766394e-07, + "loss": 0.0013, + "reward": 0.5283119380474091, + "reward_std": 0.4726581275463104, + "rewards/cosine_scaled_reward": 0.06997863575816154, + "rewards/format_reward": 0.4583333432674408, + "step": 296 + }, + { + "completion_length": 2424.90283203125, + "epoch": 1.0171232876712328, + "grad_norm": 0.14170390367507935, + "kl": 0.0355224609375, + "learning_rate": 1.002741278414069e-07, + "loss": 0.0014, + "reward": 0.8962737321853638, + "reward_std": 0.46189258992671967, + "rewards/cosine_scaled_reward": 0.20877373218536377, + "rewards/format_reward": 0.6875000298023224, + "step": 297 + }, + { + "completion_length": 2943.513916015625, + "epoch": 1.0205479452054795, + "grad_norm": 0.1521555632352829, + "kl": 0.0386962890625, + "learning_rate": 1.0012184146924223e-07, + "loss": 0.0015, + "reward": 0.26369042694568634, + "reward_std": 0.40857334434986115, + "rewards/cosine_scaled_reward": -0.15297627449035645, + "rewards/format_reward": 0.4166666716337204, + "step": 298 + }, + { + "completion_length": 2802.791748046875, + "epoch": 1.023972602739726, + "grad_norm": 0.13754691183567047, + "kl": 0.0372314453125, + "learning_rate": 1.0003046139830701e-07, + "loss": 0.0015, + "reward": 0.6113243997097015, + "reward_std": 0.4505104422569275, + "rewards/cosine_scaled_reward": 
0.055768875405192375, + "rewards/format_reward": 0.5555555820465088, + "step": 299 + }, + { + "completion_length": 2618.9722900390625, + "epoch": 1.0273972602739727, + "grad_norm": 0.15953682363033295, + "kl": 0.0386962890625, + "learning_rate": 1e-07, + "loss": 0.0015, + "reward": 0.6579746007919312, + "reward_std": 0.4734661132097244, + "rewards/cosine_scaled_reward": 0.08158569037914276, + "rewards/format_reward": 0.5763888955116272, + "step": 300 + }, + { + "epoch": 1.0273972602739727, + "step": 300, + "total_flos": 0.0, + "train_loss": 0.0008326698157195504, + "train_runtime": 30165.0377, + "train_samples_per_second": 0.239, + "train_steps_per_second": 0.01 + } + ], + "logging_steps": 1, + "max_steps": 300, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..955008b --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb13a76f265b14a21f38f645e2ef5a8b3b28cea3d8f99b0995ebab8a171ac74f +size 7864