From 63e11c1fde5e5d4880327371cd6bbd06f2929434 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Thu, 14 May 2026 18:02:49 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: kangdawei/DAPO-No-DS Source: Original Platform --- .gitattributes | 36 + README.md | 70 + all_results.json | 8 + chat_template.jinja | 1 + config.json | 60 + generation_config.json | 12 + model.safetensors | 3 + special_tokens_map.json | 23 + tokenizer.json | 3 + tokenizer_config.json | 194 +++ train_results.json | 8 + trainer_state.json | 2843 +++++++++++++++++++++++++++++++++++++++ training_args.bin | 3 + 13 files changed, 3264 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs 
-text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..0588e40 --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +datasets: knoveleng/open-rs +library_name: transformers +model_name: DAPO-No-DS +tags: +- generated_from_trainer +- open-r1 +- dapo +- trl +licence: license +--- + +# Model Card for DAPO-No-DS + +This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on the [knoveleng/open-rs](https://huggingface.co/datasets/knoveleng/open-rs) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" 
+generator = pipeline("text-generation", model="kangdawei/DAPO-No-DS", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + + + + +This model was trained with DAPO, a method introduced in [DAPO: An Open-Source LLM Reinforcement Learning System at Scale](https://huggingface.co/papers/2503.14476). + +### Framework versions + +- TRL: 0.16.0.dev0 +- Transformers: 4.57.1 +- Pytorch: 2.5.1 +- Datasets: 3.2.0 +- Tokenizers: 0.22.1 + +## Citations + +Cite DAPO as: + +```bibtex +@article{yu2025dapo, + title = {{DAPO: An Open-Source LLM Reinforcement Learning System at Scale}}, + author = {Qiying Yu and Zheng Zhang and others}, + year = 2025, + eprint = {arXiv:2503.14476}, +} + +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..fcf3493 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 0.024151897984629613, + "train_runtime": 32217.4606, + "train_samples": 7000, + "train_samples_per_second": 0.298, + "train_steps_per_second": 0.006 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..c2066bd --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for 
message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\n'}}{% endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..141d783 --- /dev/null +++ 
b/config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151646, + "dtype": "bfloat16", + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "4.57.1", + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..acaf452 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 151646, + "do_sample": true, + "eos_token_id": [ + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.57.1" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..00641a3 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34c57bcd14183f5ffced275e21b2f2e92a3928ba471c42ee25bd916d1cc49416 +size 
3554214752 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..e7cd2c1 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a +size 11422959 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..d252dd4 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "<think>", + "lstrip": false, + "normalized": false, 
+ "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "</think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "<tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "</tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..fcf3493 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 0.024151897984629613, + "train_runtime": 32217.4606, + "train_samples": 7000, + "train_samples_per_second": 0.298, + "train_steps_per_second": 0.006 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..6c2fcb4 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,2843 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.22857142857142856, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_fraction": 0.0, + "completion_length": 2571.2083587646484, + "epoch": 0.001142857142857143, + "grad_norm": 0.25632715225219727, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.1082, + "reward": 0.21363236638717353, + "reward_std": 0.5417899899184704, + "rewards/accuracy_reward": 0.22916667349636555, + 
"rewards/cosine_scaled_reward": -0.015534311532974243, + "step": 1 + }, + { + "clip_fraction": 0.0, + "completion_length": 2804.395881652832, + "epoch": 0.002285714285714286, + "grad_norm": 0.2158345729112625, + "kl": 0.0, + "learning_rate": 5e-08, + "loss": 0.0476, + "reward": 0.179365461692214, + "reward_std": 0.24326776154339314, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/cosine_scaled_reward": -0.04980122856795788, + "step": 2 + }, + { + "clip_fraction": 0.0, + "completion_length": 3340.9791717529297, + "epoch": 0.0034285714285714284, + "grad_norm": 0.1626892238855362, + "kl": 4.589557647705078e-05, + "learning_rate": 1e-07, + "loss": -0.0326, + "reward": -0.18680910766124725, + "reward_std": 0.2629435881972313, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.2076424416154623, + "step": 3 + }, + { + "clip_fraction": 0.0, + "completion_length": 2383.8125228881836, + "epoch": 0.004571428571428572, + "grad_norm": 0.321321964263916, + "kl": 3.878772258758545e-05, + "learning_rate": 1.5e-07, + "loss": 0.0713, + "reward": 0.037019677460193634, + "reward_std": 0.5308115221560001, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/cosine_scaled_reward": -0.10881366301327944, + "step": 4 + }, + { + "clip_fraction": 0.0, + "completion_length": 3285.6458740234375, + "epoch": 0.005714285714285714, + "grad_norm": 0.16392037272453308, + "kl": 3.78340482711792e-05, + "learning_rate": 2e-07, + "loss": 0.0192, + "reward": -0.10475460533052683, + "reward_std": 0.4349679294973612, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/cosine_scaled_reward": -0.18808794301003218, + "step": 5 + }, + { + "clip_fraction": 0.0, + "completion_length": 2898.625045776367, + "epoch": 0.006857142857142857, + "grad_norm": 0.19641205668449402, + "kl": 4.819035530090332e-05, + "learning_rate": 2.5e-07, + "loss": 0.0638, + "reward": -0.0739727895706892, + "reward_std": 0.40896974317729473, + "rewards/accuracy_reward": 
0.0833333358168602, + "rewards/cosine_scaled_reward": -0.15730612399056554, + "step": 6 + }, + { + "clip_fraction": 0.0, + "completion_length": 3123.3333740234375, + "epoch": 0.008, + "grad_norm": 0.15763328969478607, + "kl": 2.3871660232543945e-05, + "learning_rate": 3e-07, + "loss": 0.0023, + "reward": 0.2681586788967252, + "reward_std": 0.4782935921102762, + "rewards/accuracy_reward": 0.27083333767950535, + "rewards/cosine_scaled_reward": -0.0026746466755867004, + "step": 7 + }, + { + "clip_fraction": 0.0, + "completion_length": 2718.1458435058594, + "epoch": 0.009142857142857144, + "grad_norm": 0.18417485058307648, + "kl": 2.504885196685791e-05, + "learning_rate": 3.5e-07, + "loss": 0.0037, + "reward": 0.43941989610902965, + "reward_std": 0.6203235052525997, + "rewards/accuracy_reward": 0.35416667349636555, + "rewards/cosine_scaled_reward": 0.08525321818888187, + "step": 8 + }, + { + "clip_fraction": 0.0, + "completion_length": 3126.979232788086, + "epoch": 0.010285714285714285, + "grad_norm": 0.1806861162185669, + "kl": 4.266202449798584e-05, + "learning_rate": 4e-07, + "loss": 0.0192, + "reward": 0.06813614349812269, + "reward_std": 0.5027568060904741, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/cosine_scaled_reward": -0.09853052510879934, + "step": 9 + }, + { + "clip_fraction": 0.0, + "completion_length": 2906.416717529297, + "epoch": 0.011428571428571429, + "grad_norm": 0.18246354162693024, + "kl": 2.8778158593922853e-05, + "learning_rate": 4.5e-07, + "loss": 0.0043, + "reward": -0.07218722999095917, + "reward_std": 0.47773855179548264, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/cosine_scaled_reward": -0.17635390162467957, + "step": 10 + }, + { + "clip_fraction": 0.0, + "completion_length": 3303.500030517578, + "epoch": 0.012571428571428572, + "grad_norm": 0.15648911893367767, + "kl": 3.5509467124938965e-05, + "learning_rate": 5e-07, + "loss": -0.0166, + "reward": -0.156071281991899, + "reward_std": 0.2669069590047002, + 
"rewards/accuracy_reward": 0.0416666679084301, + "rewards/cosine_scaled_reward": -0.1977379499003291, + "step": 11 + }, + { + "clip_fraction": 0.0, + "completion_length": 2357.5208740234375, + "epoch": 0.013714285714285714, + "grad_norm": 0.2004307210445404, + "kl": 3.539770841598511e-05, + "learning_rate": 5.5e-07, + "loss": 0.0214, + "reward": -0.02095029642805457, + "reward_std": 0.5913799963891506, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/cosine_scaled_reward": -0.1667836436536163, + "step": 12 + }, + { + "clip_fraction": 0.0, + "completion_length": 2870.541702270508, + "epoch": 0.014857142857142857, + "grad_norm": 0.18646906316280365, + "kl": 3.197789192199707e-05, + "learning_rate": 6e-07, + "loss": -0.0017, + "reward": 0.14097319915890694, + "reward_std": 0.35468385741114616, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/cosine_scaled_reward": -0.04652681015431881, + "step": 13 + }, + { + "clip_fraction": 0.0, + "completion_length": 2858.375015258789, + "epoch": 0.016, + "grad_norm": 0.2710908055305481, + "kl": 2.9653310775756836e-05, + "learning_rate": 6.5e-07, + "loss": 0.063, + "reward": 0.18795185792259872, + "reward_std": 0.5712584815919399, + "rewards/accuracy_reward": 0.20833334140479565, + "rewards/cosine_scaled_reward": -0.0203814753331244, + "step": 14 + }, + { + "clip_fraction": 0.0, + "completion_length": 2703.979202270508, + "epoch": 0.017142857142857144, + "grad_norm": 0.17628003656864166, + "kl": 2.5141984224319458e-05, + "learning_rate": 7e-07, + "loss": -0.0072, + "reward": 0.27426825650036335, + "reward_std": 0.38823096454143524, + "rewards/accuracy_reward": 0.2500000037252903, + "rewards/cosine_scaled_reward": 0.02426825277507305, + "step": 15 + }, + { + "clip_fraction": 0.0, + "completion_length": 3517.666717529297, + "epoch": 0.018285714285714287, + "grad_norm": 0.15789569914340973, + "kl": 3.6329030990600586e-05, + "learning_rate": 7.5e-07, + "loss": -0.0084, + "reward": -0.18198822857812047, + 
"reward_std": 0.3138104174286127, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/cosine_scaled_reward": -0.22365489602088928, + "step": 16 + }, + { + "clip_fraction": 0.0, + "completion_length": 2283.0417137145996, + "epoch": 0.019428571428571427, + "grad_norm": 0.2678299844264984, + "kl": 3.6092475056648254e-05, + "learning_rate": 8e-07, + "loss": 0.0346, + "reward": 0.19116858765482903, + "reward_std": 0.6768512222915888, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/cosine_scaled_reward": -0.03799809608608484, + "step": 17 + }, + { + "clip_fraction": 0.0, + "completion_length": 2891.6250534057617, + "epoch": 0.02057142857142857, + "grad_norm": 0.1524839550256729, + "kl": 9.838491678237915e-06, + "learning_rate": 8.499999999999999e-07, + "loss": 0.0422, + "reward": 0.019560502842068672, + "reward_std": 0.42486423440277576, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/cosine_scaled_reward": -0.10543949622660875, + "step": 18 + }, + { + "clip_fraction": 0.0, + "completion_length": 2730.6875381469727, + "epoch": 0.021714285714285714, + "grad_norm": 0.1984628587961197, + "kl": 2.9616057872772217e-05, + "learning_rate": 9e-07, + "loss": 0.0244, + "reward": 0.5108187086880207, + "reward_std": 0.5117705333977938, + "rewards/accuracy_reward": 0.3750000111758709, + "rewards/cosine_scaled_reward": 0.1358186900615692, + "step": 19 + }, + { + "clip_fraction": 0.0, + "completion_length": 2329.9583892822266, + "epoch": 0.022857142857142857, + "grad_norm": 0.29129523038864136, + "kl": 1.1567026376724243e-05, + "learning_rate": 9.499999999999999e-07, + "loss": 0.1144, + "reward": 0.3239101804792881, + "reward_std": 0.5062661934643984, + "rewards/accuracy_reward": 0.29166666977107525, + "rewards/cosine_scaled_reward": 0.03224351815879345, + "step": 20 + }, + { + "clip_fraction": 0.0, + "completion_length": 2660.2083587646484, + "epoch": 0.024, + "grad_norm": 0.19655585289001465, + "kl": 2.6446767151355743e-05, + "learning_rate": 1e-06, + 
"loss": 0.062, + "reward": 0.05283654620870948, + "reward_std": 0.39788419753313065, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/cosine_scaled_reward": -0.09299679077230394, + "step": 21 + }, + { + "clip_fraction": 0.0, + "completion_length": 1825.020896911621, + "epoch": 0.025142857142857144, + "grad_norm": 0.2982518970966339, + "kl": 2.1491199731826782e-05, + "learning_rate": 9.99931462820376e-07, + "loss": 0.0703, + "reward": 0.23684986494481564, + "reward_std": 0.4532635463401675, + "rewards/accuracy_reward": 0.27083333767950535, + "rewards/cosine_scaled_reward": -0.033983476692810655, + "step": 22 + }, + { + "clip_fraction": 0.0, + "completion_length": 2637.2708892822266, + "epoch": 0.026285714285714287, + "grad_norm": 0.17908190190792084, + "kl": 2.055056393146515e-05, + "learning_rate": 9.997258721585931e-07, + "loss": 0.0216, + "reward": 0.04677679901942611, + "reward_std": 0.5608484419062734, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.1198898684233427, + "step": 23 + }, + { + "clip_fraction": 0.0, + "completion_length": 2836.437530517578, + "epoch": 0.027428571428571427, + "grad_norm": 0.18375183641910553, + "kl": 2.3286789655685425e-05, + "learning_rate": 9.993832906395582e-07, + "loss": 0.0363, + "reward": 0.1852389182895422, + "reward_std": 0.5924705881625414, + "rewards/accuracy_reward": 0.22916667349636555, + "rewards/cosine_scaled_reward": -0.04392773995641619, + "step": 24 + }, + { + "clip_fraction": 0.0, + "completion_length": 2892.875045776367, + "epoch": 0.02857142857142857, + "grad_norm": 0.19979171454906464, + "kl": 3.429921343922615e-05, + "learning_rate": 9.989038226169207e-07, + "loss": -0.0319, + "reward": 0.18543785763904452, + "reward_std": 0.5226433798670769, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/cosine_scaled_reward": -0.022895504254847765, + "step": 25 + }, + { + "clip_fraction": 0.0, + "completion_length": 2954.4791870117188, + "epoch": 
0.029714285714285714, + "grad_norm": 0.1611883044242859, + "kl": 2.4512410163879395e-05, + "learning_rate": 9.982876141412855e-07, + "loss": -0.0069, + "reward": 0.2805785769596696, + "reward_std": 0.33157524187117815, + "rewards/accuracy_reward": 0.25000000558793545, + "rewards/cosine_scaled_reward": 0.030578549951314926, + "step": 26 + }, + { + "clip_fraction": 0.0, + "completion_length": 3068.104232788086, + "epoch": 0.030857142857142857, + "grad_norm": 0.1950128972530365, + "kl": 8.122995495796204e-06, + "learning_rate": 9.975348529157229e-07, + "loss": 0.0453, + "reward": 0.07182890921831131, + "reward_std": 0.5058779399842024, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.09483776055276394, + "step": 27 + }, + { + "clip_fraction": 0.0, + "completion_length": 2946.7083587646484, + "epoch": 0.032, + "grad_norm": 0.17087508738040924, + "kl": 1.4003482647240162e-05, + "learning_rate": 9.96645768238595e-07, + "loss": -0.041, + "reward": 0.18860503658652306, + "reward_std": 0.5205447860062122, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/cosine_scaled_reward": -0.01972830155864358, + "step": 28 + }, + { + "clip_fraction": 0.0, + "completion_length": 3220.2708740234375, + "epoch": 0.03314285714285714, + "grad_norm": 0.24361945688724518, + "kl": 3.1463801860809326e-05, + "learning_rate": 9.956206309337066e-07, + "loss": -0.1063, + "reward": -0.17443929053843021, + "reward_std": 0.3365236781537533, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/cosine_scaled_reward": -0.2161059557693079, + "step": 29 + }, + { + "clip_fraction": 0.0, + "completion_length": 2893.104248046875, + "epoch": 0.03428571428571429, + "grad_norm": 0.2155693769454956, + "kl": 5.615316331386566e-05, + "learning_rate": 9.944597532678119e-07, + "loss": 0.0487, + "reward": 0.27425617165863514, + "reward_std": 0.6831754259765148, + "rewards/accuracy_reward": 0.25000000558793545, + "rewards/cosine_scaled_reward": 0.024256166070699692, 
+ "step": 30 + }, + { + "clip_fraction": 0.0, + "completion_length": 3071.4375, + "epoch": 0.03542857142857143, + "grad_norm": 0.17095845937728882, + "kl": 4.3095555156469345e-05, + "learning_rate": 9.931634888554935e-07, + "loss": -0.0257, + "reward": -0.04954687878489494, + "reward_std": 0.3689954075962305, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/cosine_scaled_reward": -0.15371354296803474, + "step": 31 + }, + { + "clip_fraction": 0.0, + "completion_length": 3270.375030517578, + "epoch": 0.036571428571428574, + "grad_norm": 0.1867983490228653, + "kl": 4.005682421848178e-05, + "learning_rate": 9.917322325514487e-07, + "loss": -0.0114, + "reward": 0.33223521208856255, + "reward_std": 0.5476495027542114, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/cosine_scaled_reward": 0.04056852776557207, + "step": 32 + }, + { + "clip_fraction": 0.0, + "completion_length": 3300.2708740234375, + "epoch": 0.037714285714285714, + "grad_norm": 0.15105699002742767, + "kl": 0.00013676844537258148, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0197, + "reward": 0.19714780524373055, + "reward_std": 0.5669918619096279, + "rewards/accuracy_reward": 0.22916667349636555, + "rewards/cosine_scaled_reward": -0.032018861966207623, + "step": 33 + }, + { + "clip_fraction": 0.0, + "completion_length": 2588.1875610351562, + "epoch": 0.038857142857142854, + "grad_norm": 0.47045713663101196, + "kl": 0.00015014410018920898, + "learning_rate": 9.88466529153356e-07, + "loss": 0.1528, + "reward": 0.3600555928424001, + "reward_std": 0.6696161534637213, + "rewards/accuracy_reward": 0.31250000931322575, + "rewards/cosine_scaled_reward": 0.0475555956363678, + "step": 34 + }, + { + "clip_fraction": 0.0, + "completion_length": 2979.000030517578, + "epoch": 0.04, + "grad_norm": 0.21019528806209564, + "kl": 0.00010133162140846252, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0436, + "reward": 0.08721516001969576, + "reward_std": 0.6073270477354527, + 
"rewards/accuracy_reward": 0.16666666977107525, + "rewards/cosine_scaled_reward": -0.07945150946034119, + "step": 35 + }, + { + "clip_fraction": 0.0, + "completion_length": 3349.1041870117188, + "epoch": 0.04114285714285714, + "grad_norm": 0.1755942404270172, + "kl": 0.00011576339602470398, + "learning_rate": 9.846666218300807e-07, + "loss": 0.0059, + "reward": -0.17237537261098623, + "reward_std": 0.22124690702185035, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.19320870749652386, + "step": 36 + }, + { + "clip_fraction": 0.0, + "completion_length": 3302.3333435058594, + "epoch": 0.04228571428571429, + "grad_norm": 0.172617107629776, + "kl": 7.249228656291962e-05, + "learning_rate": 9.825677631722435e-07, + "loss": -0.0307, + "reward": -0.2602706290781498, + "reward_std": 0.16562421713024378, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.26027063466608524, + "step": 37 + }, + { + "clip_fraction": 0.0, + "completion_length": 3232.187515258789, + "epoch": 0.04342857142857143, + "grad_norm": 0.16284486651420593, + "kl": 8.244812488555908e-05, + "learning_rate": 9.80337140183366e-07, + "loss": -0.0182, + "reward": -0.027252301573753357, + "reward_std": 0.2725130654871464, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/cosine_scaled_reward": -0.13141895458102226, + "step": 38 + }, + { + "clip_fraction": 0.0, + "completion_length": 2854.1042098999023, + "epoch": 0.044571428571428574, + "grad_norm": 0.17649812996387482, + "kl": 9.210407733917236e-05, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0249, + "reward": 0.1576530784368515, + "reward_std": 0.24235367169603705, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/cosine_scaled_reward": -0.05068026855587959, + "step": 39 + }, + { + "clip_fraction": 0.0, + "completion_length": 2369.416702270508, + "epoch": 0.045714285714285714, + "grad_norm": 0.18827204406261444, + "kl": 0.00034741777926683426, + "learning_rate": 
9.754833590196926e-07, + "loss": -0.0217, + "reward": 0.06249742116779089, + "reward_std": 0.2527109682559967, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/cosine_scaled_reward": -0.0833359295502305, + "step": 40 + }, + { + "clip_fraction": 0.0, + "completion_length": 2972.229217529297, + "epoch": 0.046857142857142854, + "grad_norm": 0.17488084733486176, + "kl": 0.0001214444637298584, + "learning_rate": 9.728616793536587e-07, + "loss": -0.0016, + "reward": 0.055548351956531405, + "reward_std": 0.5645285593345761, + "rewards/accuracy_reward": 0.16666666977107525, + "rewards/cosine_scaled_reward": -0.11111831851303577, + "step": 41 + }, + { + "clip_fraction": 0.0, + "completion_length": 2735.8958473205566, + "epoch": 0.048, + "grad_norm": 0.28289923071861267, + "kl": 8.103251457214355e-05, + "learning_rate": 9.701111919237408e-07, + "loss": -0.0087, + "reward": -0.2322952365502715, + "reward_std": 0.2050471631810069, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.25312857422977686, + "step": 42 + }, + { + "clip_fraction": 0.0, + "completion_length": 3113.8958587646484, + "epoch": 0.04914285714285714, + "grad_norm": 0.20076647400856018, + "kl": 8.933991193771362e-05, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0291, + "reward": -0.03336653020232916, + "reward_std": 0.36820667795836926, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/cosine_scaled_reward": -0.1375331999734044, + "step": 43 + }, + { + "clip_fraction": 0.0, + "completion_length": 2842.6250228881836, + "epoch": 0.05028571428571429, + "grad_norm": 0.32278913259506226, + "kl": 0.0004453342407941818, + "learning_rate": 9.64227184053598e-07, + "loss": 0.0449, + "reward": 0.2544646831229329, + "reward_std": 0.47283124178647995, + "rewards/accuracy_reward": 0.2291666753590107, + "rewards/cosine_scaled_reward": 0.025297993794083595, + "step": 44 + }, + { + "clip_fraction": 0.0, + "completion_length": 3480.7708740234375, + "epoch": 
0.05142857142857143, + "grad_norm": 0.1464184671640396, + "kl": 0.00017410144209861755, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0163, + "reward": 0.20797571539878845, + "reward_std": 0.4654997680336237, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/cosine_scaled_reward": -0.0003576315939426422, + "step": 45 + }, + { + "clip_fraction": 0.0, + "completion_length": 3276.7916717529297, + "epoch": 0.052571428571428575, + "grad_norm": 0.20637758076190948, + "kl": 0.0003883976023644209, + "learning_rate": 9.578385041664925e-07, + "loss": -0.0218, + "reward": -0.2070446526631713, + "reward_std": 0.15773496124893427, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.20704465080052614, + "step": 46 + }, + { + "clip_fraction": 0.0, + "completion_length": 2968.0833740234375, + "epoch": 0.053714285714285714, + "grad_norm": 0.20068301260471344, + "kl": 0.0002564918249845505, + "learning_rate": 9.54457320834625e-07, + "loss": 0.1738, + "reward": 0.22800862789154053, + "reward_std": 0.6421588193625212, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/cosine_scaled_reward": -0.0011580577120184898, + "step": 47 + }, + { + "clip_fraction": 0.0, + "completion_length": 2857.354232788086, + "epoch": 0.054857142857142854, + "grad_norm": 0.20215612649917603, + "kl": 0.0009041652083396912, + "learning_rate": 9.509529358847654e-07, + "loss": -0.0166, + "reward": 0.11733034905046225, + "reward_std": 0.36523033399134874, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.04933632561005652, + "step": 48 + }, + { + "clip_fraction": 0.0, + "completion_length": 2516.229217529297, + "epoch": 0.056, + "grad_norm": 0.19533751904964447, + "kl": 0.00025360286235809326, + "learning_rate": 9.473264167865171e-07, + "loss": -0.0101, + "reward": 0.2620775690302253, + "reward_std": 0.6330471076071262, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/cosine_scaled_reward": 0.012077568098902702, + "step": 
49 + }, + { + "clip_fraction": 0.0, + "completion_length": 2981.4375, + "epoch": 0.05714285714285714, + "grad_norm": 0.165849506855011, + "kl": 0.00045928824692964554, + "learning_rate": 9.43578868212728e-07, + "loss": -0.0219, + "reward": 0.291345676407218, + "reward_std": 0.4359543416649103, + "rewards/accuracy_reward": 0.29166667349636555, + "rewards/cosine_scaled_reward": -0.0003209911519661546, + "step": 50 + }, + { + "clip_fraction": 0.0, + "completion_length": 2388.0625381469727, + "epoch": 0.05828571428571429, + "grad_norm": 0.22810906171798706, + "kl": 0.0011309236288070679, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0198, + "reward": 0.11073957197368145, + "reward_std": 0.4677628371864557, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/cosine_scaled_reward": -0.055927105247974396, + "step": 51 + }, + { + "clip_fraction": 0.0, + "completion_length": 2879.5417289733887, + "epoch": 0.05942857142857143, + "grad_norm": 0.22546225786209106, + "kl": 0.0010811500251293182, + "learning_rate": 9.357252853159505e-07, + "loss": 0.0439, + "reward": 0.2536798194050789, + "reward_std": 0.6959168985486031, + "rewards/accuracy_reward": 0.2708333395421505, + "rewards/cosine_scaled_reward": -0.01715349592268467, + "step": 52 + }, + { + "clip_fraction": 0.0, + "completion_length": 2816.3125610351562, + "epoch": 0.060571428571428575, + "grad_norm": 0.22971203923225403, + "kl": 0.0007025115191936493, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0842, + "reward": 0.30444883555173874, + "reward_std": 0.6236651353538036, + "rewards/accuracy_reward": 0.2916666753590107, + "rewards/cosine_scaled_reward": 0.01278215617639944, + "step": 53 + }, + { + "clip_fraction": 0.0, + "completion_length": 2990.541732788086, + "epoch": 0.061714285714285715, + "grad_norm": 0.17176969349384308, + "kl": 0.00022144615650177002, + "learning_rate": 9.274017555754407e-07, + "loss": 0.0564, + "reward": 0.36173775792121887, + "reward_std": 0.7157497424632311, + 
"rewards/accuracy_reward": 0.31250000931322575, + "rewards/cosine_scaled_reward": 0.04923774115741253, + "step": 54 + }, + { + "clip_fraction": 0.0, + "completion_length": 3098.687530517578, + "epoch": 0.06285714285714286, + "grad_norm": 0.1664225459098816, + "kl": 0.0008269157260656357, + "learning_rate": 9.230669076497687e-07, + "loss": 0.055, + "reward": 0.33386562392115593, + "reward_std": 0.4932790081948042, + "rewards/accuracy_reward": 0.27083333395421505, + "rewards/cosine_scaled_reward": 0.06303227692842484, + "step": 55 + }, + { + "clip_fraction": 0.0, + "completion_length": 2980.1667098999023, + "epoch": 0.064, + "grad_norm": 0.22248487174510956, + "kl": 0.0004000365734100342, + "learning_rate": 9.186184199300463e-07, + "loss": 0.0463, + "reward": 0.1612101886421442, + "reward_std": 0.3384838867932558, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/cosine_scaled_reward": -0.047123152762651443, + "step": 56 + }, + { + "clip_fraction": 0.0, + "completion_length": 3345.5833740234375, + "epoch": 0.06514285714285714, + "grad_norm": 0.14005281031131744, + "kl": 0.00021963752806186676, + "learning_rate": 9.140576474687263e-07, + "loss": 0.02, + "reward": -0.10464224513270892, + "reward_std": 0.44751227274537086, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/cosine_scaled_reward": -0.1879755798727274, + "step": 57 + }, + { + "clip_fraction": 0.0, + "completion_length": 2249.7917098999023, + "epoch": 0.06628571428571428, + "grad_norm": 0.21027207374572754, + "kl": 0.00269511342048645, + "learning_rate": 9.093859795212817e-07, + "loss": 0.0657, + "reward": 0.3534491113387048, + "reward_std": 0.5972686447203159, + "rewards/accuracy_reward": 0.31250000186264515, + "rewards/cosine_scaled_reward": 0.04094908758997917, + "step": 58 + }, + { + "clip_fraction": 0.0, + "completion_length": 2903.854179382324, + "epoch": 0.06742857142857143, + "grad_norm": 0.205511212348938, + "kl": 0.0010725334286689758, + "learning_rate": 9.046048391230247e-07, + 
"loss": -0.0834, + "reward": -0.007946477271616459, + "reward_std": 0.4581656437367201, + "rewards/accuracy_reward": 0.12500000186264515, + "rewards/cosine_scaled_reward": -0.13294648192822933, + "step": 59 + }, + { + "clip_fraction": 0.0, + "completion_length": 3034.270866394043, + "epoch": 0.06857142857142857, + "grad_norm": 0.16402272880077362, + "kl": 0.00045564770698547363, + "learning_rate": 8.997156826556369e-07, + "loss": 0.0219, + "reward": -0.010321232955902815, + "reward_std": 0.44436920061707497, + "rewards/accuracy_reward": 0.12500000186264515, + "rewards/cosine_scaled_reward": -0.13532123528420925, + "step": 60 + }, + { + "clip_fraction": 0.0, + "completion_length": 3173.125030517578, + "epoch": 0.06971428571428571, + "grad_norm": 0.15932059288024902, + "kl": 0.0008110031485557556, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0541, + "reward": -0.0845769364386797, + "reward_std": 0.42896613012999296, + "rewards/accuracy_reward": 0.12500000186264515, + "rewards/cosine_scaled_reward": -0.20957693923264742, + "step": 61 + }, + { + "clip_fraction": 0.0, + "completion_length": 2611.666717529297, + "epoch": 0.07085714285714285, + "grad_norm": 0.2158014327287674, + "kl": 0.07185456156730652, + "learning_rate": 8.896193111002475e-07, + "loss": 0.0293, + "reward": 0.34560922905802727, + "reward_std": 0.7172879008576274, + "rewards/accuracy_reward": 0.31250000558793545, + "rewards/cosine_scaled_reward": 0.033109224401414394, + "step": 62 + }, + { + "clip_fraction": 0.0, + "completion_length": 2483.729232788086, + "epoch": 0.072, + "grad_norm": 0.17524653673171997, + "kl": 0.003895312547683716, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0736, + "reward": 0.3518992383033037, + "reward_std": 0.4679460935294628, + "rewards/accuracy_reward": 0.29166666977107525, + "rewards/cosine_scaled_reward": 0.060232581570744514, + "step": 63 + }, + { + "clip_fraction": 0.0, + "completion_length": 3119.6666870117188, + "epoch": 0.07314285714285715, + 
"grad_norm": 0.17840062081813812, + "kl": 0.001272439956665039, + "learning_rate": 8.791091657286267e-07, + "loss": 0.0552, + "reward": 0.021638838574290276, + "reward_std": 0.5125578781589866, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/cosine_scaled_reward": -0.12419449165463448, + "step": 64 + }, + { + "clip_fraction": 0.0, + "completion_length": 2688.958366394043, + "epoch": 0.07428571428571429, + "grad_norm": 0.1985544115304947, + "kl": 0.001712799072265625, + "learning_rate": 8.737029101523929e-07, + "loss": -0.0062, + "reward": 0.08414293639361858, + "reward_std": 0.45870870165526867, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/cosine_scaled_reward": -0.10335706360638142, + "step": 65 + }, + { + "clip_fraction": 0.0, + "completion_length": 2155.270839691162, + "epoch": 0.07542857142857143, + "grad_norm": 0.24202020466327667, + "kl": 0.0014235973358154297, + "learning_rate": 8.681980515339463e-07, + "loss": 0.0342, + "reward": 0.2718210807070136, + "reward_std": 0.3149276631884277, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/cosine_scaled_reward": -0.019845602102577686, + "step": 66 + }, + { + "clip_fraction": 0.0, + "completion_length": 3514.5, + "epoch": 0.07657142857142857, + "grad_norm": 0.13506484031677246, + "kl": 0.0011570192873477936, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0197, + "reward": -0.17880290560424328, + "reward_std": 0.3006857465952635, + "rewards/accuracy_reward": 0.0625, + "rewards/cosine_scaled_reward": -0.24130290746688843, + "step": 67 + }, + { + "clip_fraction": 0.0, + "completion_length": 2224.1250228881836, + "epoch": 0.07771428571428571, + "grad_norm": 0.26422637701034546, + "kl": 0.008173942565917969, + "learning_rate": 8.568992620281243e-07, + "loss": 0.1289, + "reward": 0.26997855119407177, + "reward_std": 0.5711956042796373, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/cosine_scaled_reward": 0.019978564232587814, + "step": 68 + }, + { + 
"clip_fraction": 0.0, + "completion_length": 2771.104202270508, + "epoch": 0.07885714285714286, + "grad_norm": 0.22578439116477966, + "kl": 0.004560351371765137, + "learning_rate": 8.511087728614862e-07, + "loss": -0.0186, + "reward": -0.20487310830503702, + "reward_std": 0.2483804766088724, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.22570644319057465, + "step": 69 + }, + { + "clip_fraction": 0.0, + "completion_length": 3143.8333587646484, + "epoch": 0.08, + "grad_norm": 0.2783580422401428, + "kl": 0.0023155808448791504, + "learning_rate": 8.452265630457282e-07, + "loss": 0.0232, + "reward": -0.22550072893500328, + "reward_std": 0.24843326956033707, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.24633406475186348, + "step": 70 + }, + { + "clip_fraction": 0.0, + "completion_length": 2701.6666870117188, + "epoch": 0.08114285714285714, + "grad_norm": 0.24257808923721313, + "kl": 0.0030676722526550293, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0883, + "reward": 0.10035606473684311, + "reward_std": 0.5035490561276674, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.0663105882704258, + "step": 71 + }, + { + "clip_fraction": 0.0, + "completion_length": 3351.541748046875, + "epoch": 0.08228571428571428, + "grad_norm": 0.1588461697101593, + "kl": 0.00363922119140625, + "learning_rate": 8.331941759724268e-07, + "loss": 0.0452, + "reward": -0.19743284268770367, + "reward_std": 0.27725750021636486, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.2182661797851324, + "step": 72 + }, + { + "clip_fraction": 0.0, + "completion_length": 3507.666717529297, + "epoch": 0.08342857142857144, + "grad_norm": 0.1444426029920578, + "kl": 0.0006330609321594238, + "learning_rate": 8.270476638965461e-07, + "loss": 0.0331, + "reward": -0.05398387461900711, + "reward_std": 0.49269056133925915, + "rewards/accuracy_reward": 
0.10416666977107525, + "rewards/cosine_scaled_reward": -0.15815055184066296, + "step": 73 + }, + { + "clip_fraction": 0.0, + "completion_length": 3344.041717529297, + "epoch": 0.08457142857142858, + "grad_norm": 0.15516585111618042, + "kl": 0.0021051764488220215, + "learning_rate": 8.208167604184217e-07, + "loss": 0.0095, + "reward": 0.008867893368005753, + "reward_std": 0.26112012285739183, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/cosine_scaled_reward": -0.1369654512964189, + "step": 74 + }, + { + "clip_fraction": 0.0, + "completion_length": 2982.4583435058594, + "epoch": 0.08571428571428572, + "grad_norm": 0.13954143226146698, + "kl": 0.011302351951599121, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0333, + "reward": 0.28953575156629086, + "reward_std": 0.3247816953808069, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/cosine_scaled_reward": 0.018702420871704817, + "step": 75 + }, + { + "clip_fraction": 0.0, + "completion_length": 3167.2083587646484, + "epoch": 0.08685714285714285, + "grad_norm": 0.1654420793056488, + "kl": 0.0009011179208755493, + "learning_rate": 8.081093963579707e-07, + "loss": 0.014, + "reward": -0.2234128573909402, + "reward_std": 0.27190146408975124, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.24424619507044554, + "step": 76 + }, + { + "clip_fraction": 0.0, + "completion_length": 3288.125030517578, + "epoch": 0.088, + "grad_norm": 0.15181006491184235, + "kl": 0.0010012537240982056, + "learning_rate": 8.01636806561836e-07, + "loss": -0.013, + "reward": 0.1649619173258543, + "reward_std": 0.2921932004392147, + "rewards/accuracy_reward": 0.1875, + "rewards/cosine_scaled_reward": -0.022538071498274803, + "step": 77 + }, + { + "clip_fraction": 0.0, + "completion_length": 3191.187530517578, + "epoch": 0.08914285714285715, + "grad_norm": 0.1391523778438568, + "kl": 0.0008662641048431396, + "learning_rate": 7.950875657567621e-07, + "loss": 0.0498, + "reward": 
0.156300887465477, + "reward_std": 0.28907839488238096, + "rewards/accuracy_reward": 0.1875, + "rewards/cosine_scaled_reward": -0.031199105083942413, + "step": 78 + }, + { + "clip_fraction": 0.0, + "completion_length": 2525.6459159851074, + "epoch": 0.09028571428571429, + "grad_norm": 0.22027799487113953, + "kl": 0.0032279491424560547, + "learning_rate": 7.884636689049422e-07, + "loss": -0.0856, + "reward": 0.27039410918951035, + "reward_std": 0.4748187121003866, + "rewards/accuracy_reward": 0.29166666977107525, + "rewards/cosine_scaled_reward": -0.021272567100822926, + "step": 79 + }, + { + "clip_fraction": 0.0, + "completion_length": 3310.000030517578, + "epoch": 0.09142857142857143, + "grad_norm": 0.15355749428272247, + "kl": 0.0017953217029571533, + "learning_rate": 7.817671337095244e-07, + "loss": 0.0254, + "reward": 0.15592915751039982, + "reward_std": 0.45574192702770233, + "rewards/accuracy_reward": 0.20833334140479565, + "rewards/cosine_scaled_reward": -0.05240418389439583, + "step": 80 + }, + { + "clip_fraction": 0.0, + "completion_length": 3161.375030517578, + "epoch": 0.09257142857142857, + "grad_norm": 0.20919400453567505, + "kl": 0.004012107849121094, + "learning_rate": 7.75e-07, + "loss": 0.0254, + "reward": -0.12468860670924187, + "reward_std": 0.35988178849220276, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/cosine_scaled_reward": -0.1871886090375483, + "step": 81 + }, + { + "clip_fraction": 0.0, + "completion_length": 2771.8958435058594, + "epoch": 0.09371428571428571, + "grad_norm": 0.184526726603508, + "kl": 0.0013185292482376099, + "learning_rate": 7.681643291108517e-07, + "loss": 0.0203, + "reward": 0.25131775764748454, + "reward_std": 0.3994978480041027, + "rewards/accuracy_reward": 0.2500000037252903, + "rewards/cosine_scaled_reward": 0.0013177692890167236, + "step": 82 + }, + { + "clip_fraction": 0.0, + "completion_length": 2935.8958435058594, + "epoch": 0.09485714285714286, + "grad_norm": 0.21173416078090668, + "kl": 
0.0016698837280273438, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0217, + "reward": -0.1123672490939498, + "reward_std": 0.27851989958435297, + "rewards/accuracy_reward": 0.0625, + "rewards/cosine_scaled_reward": -0.17486725561320782, + "step": 83 + }, + { + "clip_fraction": 0.0, + "completion_length": 3135.541702270508, + "epoch": 0.096, + "grad_norm": 0.16200174391269684, + "kl": 0.0007222890853881836, + "learning_rate": 7.54295724882796e-07, + "loss": 0.0357, + "reward": 0.19974466552957892, + "reward_std": 0.48104337602853775, + "rewards/accuracy_reward": 0.25000000186264515, + "rewards/cosine_scaled_reward": -0.05025534983724356, + "step": 84 + }, + { + "clip_fraction": 0.0, + "completion_length": 3218.375045776367, + "epoch": 0.09714285714285714, + "grad_norm": 0.15586286783218384, + "kl": 0.0010872483253479004, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0099, + "reward": 0.09634784981608391, + "reward_std": 0.6995302718132734, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/cosine_scaled_reward": -0.09115215530619025, + "step": 85 + }, + { + "clip_fraction": 0.0, + "completion_length": 3214.8958587646484, + "epoch": 0.09828571428571428, + "grad_norm": 0.16115757822990417, + "kl": 0.0023956298828125, + "learning_rate": 7.401782177833147e-07, + "loss": 0.0367, + "reward": 0.044234735891222954, + "reward_std": 0.4710828922688961, + "rewards/accuracy_reward": 0.14583333767950535, + "rewards/cosine_scaled_reward": -0.10159859777195379, + "step": 86 + }, + { + "clip_fraction": 0.0, + "completion_length": 3030.041748046875, + "epoch": 0.09942857142857142, + "grad_norm": 0.23102359473705292, + "kl": 0.007025090977549553, + "learning_rate": 7.330314893841101e-07, + "loss": 0.1063, + "reward": 0.1451429482549429, + "reward_std": 0.5002225376665592, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/cosine_scaled_reward": -0.0631904061883688, + "step": 87 + }, + { + "clip_fraction": 0.0, + "completion_length": 
3002.729248046875, + "epoch": 0.10057142857142858, + "grad_norm": 0.19206282496452332, + "kl": 0.008913278579711914, + "learning_rate": 7.258290078201731e-07, + "loss": 0.0245, + "reward": 0.16737306024879217, + "reward_std": 0.46823734790086746, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/cosine_scaled_reward": -0.040960275335237384, + "step": 88 + }, + { + "clip_fraction": 0.0, + "completion_length": 3434.5208435058594, + "epoch": 0.10171428571428572, + "grad_norm": 0.15944458544254303, + "kl": 0.0026093721389770508, + "learning_rate": 7.185729670371604e-07, + "loss": 0.0369, + "reward": 0.0876035988330841, + "reward_std": 0.35983937978744507, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/cosine_scaled_reward": -0.07906308211386204, + "step": 89 + }, + { + "clip_fraction": 0.0, + "completion_length": 2749.937515258789, + "epoch": 0.10285714285714286, + "grad_norm": 0.3216039538383484, + "kl": 0.008787989616394043, + "learning_rate": 7.11265577295385e-07, + "loss": -0.007, + "reward": -0.27455414831638336, + "reward_std": 0.2550176875665784, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.29538749624043703, + "step": 90 + }, + { + "clip_fraction": 0.0, + "completion_length": 3049.4166870117188, + "epoch": 0.104, + "grad_norm": 0.21130341291427612, + "kl": 0.003309965133666992, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0554, + "reward": 0.2680956181138754, + "reward_std": 0.5306598274037242, + "rewards/accuracy_reward": 0.22916667349636555, + "rewards/cosine_scaled_reward": 0.038928942289203405, + "step": 91 + }, + { + "clip_fraction": 0.0, + "completion_length": 3023.8959045410156, + "epoch": 0.10514285714285715, + "grad_norm": 0.22417984902858734, + "kl": 0.0150984525680542, + "learning_rate": 6.965056695057204e-07, + "loss": 0.0827, + "reward": 0.06401701085269451, + "reward_std": 0.4534417111426592, + "rewards/accuracy_reward": 0.16666666977107525, + "rewards/cosine_scaled_reward": 
-0.10264966636896133, + "step": 92 + }, + { + "clip_fraction": 0.0, + "completion_length": 3390.0, + "epoch": 0.10628571428571429, + "grad_norm": 0.16805854439735413, + "kl": 0.002878427505493164, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0208, + "reward": -0.25338721089065075, + "reward_std": 0.18274102546274662, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.25338720716536045, + "step": 93 + }, + { + "clip_fraction": 0.0, + "completion_length": 3132.7291870117188, + "epoch": 0.10742857142857143, + "grad_norm": 0.23390114307403564, + "kl": 0.004430115222930908, + "learning_rate": 6.815672671252315e-07, + "loss": 0.0391, + "reward": 0.06390632130205631, + "reward_std": 0.2537277704104781, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.10276034381240606, + "step": 94 + }, + { + "clip_fraction": 0.0, + "completion_length": 3494.0208435058594, + "epoch": 0.10857142857142857, + "grad_norm": 0.13700152933597565, + "kl": 0.0011189579963684082, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0066, + "reward": -0.24112762324512005, + "reward_std": 0.25422961357980967, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.2619609571993351, + "step": 95 + }, + { + "clip_fraction": 0.0, + "completion_length": 3054.812530517578, + "epoch": 0.10971428571428571, + "grad_norm": 0.1651657223701477, + "kl": 0.00439077615737915, + "learning_rate": 6.664685702961344e-07, + "loss": 0.0172, + "reward": 0.043599123833701015, + "reward_std": 0.5089055569842458, + "rewards/accuracy_reward": 0.16666667349636555, + "rewards/cosine_scaled_reward": -0.12306755485769827, + "step": 96 + }, + { + "clip_fraction": 0.0, + "completion_length": 3421.5833740234375, + "epoch": 0.11085714285714286, + "grad_norm": 0.1608337163925171, + "kl": 0.00201570987701416, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0159, + "reward": 0.07680944725871086, + "reward_std": 0.4438377758488059, + 
"rewards/accuracy_reward": 0.2083333395421505, + "rewards/cosine_scaled_reward": -0.13152388390153646, + "step": 97 + }, + { + "clip_fraction": 0.0, + "completion_length": 3099.375045776367, + "epoch": 0.112, + "grad_norm": 0.20414039492607117, + "kl": 0.0015641450881958008, + "learning_rate": 6.512279744547392e-07, + "loss": 0.0324, + "reward": 0.009776812046766281, + "reward_std": 0.18312601745128632, + "rewards/accuracy_reward": 0.125, + "rewards/cosine_scaled_reward": -0.11522318236529827, + "step": 98 + }, + { + "clip_fraction": 0.0, + "completion_length": 2837.979217529297, + "epoch": 0.11314285714285714, + "grad_norm": 0.21932294964790344, + "kl": 0.0025873184204101562, + "learning_rate": 6.435602608679916e-07, + "loss": -0.043, + "reward": 0.07655952998902649, + "reward_std": 0.24549889098852873, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/cosine_scaled_reward": -0.06927378475666046, + "step": 99 + }, + { + "clip_fraction": 0.0, + "completion_length": 3029.0208587646484, + "epoch": 0.11428571428571428, + "grad_norm": 0.17687632143497467, + "kl": 0.005702972412109375, + "learning_rate": 6.358640479194451e-07, + "loss": 0.0584, + "reward": 0.3614894598722458, + "reward_std": 0.4475015373900533, + "rewards/accuracy_reward": 0.3125000074505806, + "rewards/cosine_scaled_reward": 0.048989441245794296, + "step": 100 + }, + { + "clip_fraction": 0.0, + "completion_length": 3111.666702270508, + "epoch": 0.11542857142857142, + "grad_norm": 0.19027626514434814, + "kl": 0.0019845962524414062, + "learning_rate": 6.281416799501187e-07, + "loss": 0.025, + "reward": 0.06912354379892349, + "reward_std": 0.28665477968752384, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.09754312154836953, + "step": 101 + }, + { + "clip_fraction": 0.0, + "completion_length": 2907.312530517578, + "epoch": 0.11657142857142858, + "grad_norm": 0.2261529117822647, + "kl": 0.006736278533935547, + "learning_rate": 6.203955092681039e-07, + 
"loss": 0.0423, + "reward": 0.11842460185289383, + "reward_std": 0.49445752426981926, + "rewards/accuracy_reward": 0.16666667349636555, + "rewards/cosine_scaled_reward": -0.0482420613989234, + "step": 102 + }, + { + "clip_fraction": 0.0, + "completion_length": 3219.6458587646484, + "epoch": 0.11771428571428572, + "grad_norm": 0.20831391215324402, + "kl": 0.004827260971069336, + "learning_rate": 6.126278954320294e-07, + "loss": 0.065, + "reward": -0.018325381679460406, + "reward_std": 0.4717335421591997, + "rewards/accuracy_reward": 0.12500000186264515, + "rewards/cosine_scaled_reward": -0.14332537725567818, + "step": 103 + }, + { + "clip_fraction": 0.0, + "completion_length": 2912.9791870117188, + "epoch": 0.11885714285714286, + "grad_norm": 0.17986515164375305, + "kl": 0.034677982330322266, + "learning_rate": 6.048412045323164e-07, + "loss": 0.0421, + "reward": -0.11950392462313175, + "reward_std": 0.2809145264327526, + "rewards/accuracy_reward": 0.0625, + "rewards/cosine_scaled_reward": -0.18200392089784145, + "step": 104 + }, + { + "clip_fraction": 0.0, + "completion_length": 3010.8958435058594, + "epoch": 0.12, + "grad_norm": 0.21148350834846497, + "kl": 0.0034750699996948242, + "learning_rate": 5.97037808470444e-07, + "loss": 0.012, + "reward": 0.14130431599915028, + "reward_std": 0.47796807438135147, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/cosine_scaled_reward": -0.04619568458292633, + "step": 105 + }, + { + "clip_fraction": 0.0, + "completion_length": 2606.0416870117188, + "epoch": 0.12114285714285715, + "grad_norm": 0.18517449498176575, + "kl": 0.014556169509887695, + "learning_rate": 5.892200842364462e-07, + "loss": 0.0554, + "reward": 0.554547032341361, + "reward_std": 0.45337705919519067, + "rewards/accuracy_reward": 0.41666666977107525, + "rewards/cosine_scaled_reward": 0.1378803295083344, + "step": 106 + }, + { + "clip_fraction": 0.0, + "completion_length": 3122.312515258789, + "epoch": 0.12228571428571429, + "grad_norm": 
0.2288726419210434, + "kl": 0.0027909278869628906, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0421, + "reward": 0.014997333288192749, + "reward_std": 0.35810419358313084, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/cosine_scaled_reward": -0.1308360043913126, + "step": 107 + }, + { + "clip_fraction": 0.0, + "completion_length": 3150.2083587646484, + "epoch": 0.12342857142857143, + "grad_norm": 0.22030171751976013, + "kl": 0.0030778050422668457, + "learning_rate": 5.735511803093248e-07, + "loss": 0.0993, + "reward": 0.12523731286637485, + "reward_std": 0.47955003939568996, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/cosine_scaled_reward": -0.06226269342005253, + "step": 108 + }, + { + "clip_fraction": 0.0, + "completion_length": 3190.062530517578, + "epoch": 0.12457142857142857, + "grad_norm": 0.16835717856884003, + "kl": 0.001190185546875, + "learning_rate": 5.657047735161255e-07, + "loss": 0.0296, + "reward": 0.11127902567386627, + "reward_std": 0.30289303325116634, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.05538764409720898, + "step": 109 + }, + { + "clip_fraction": 0.0, + "completion_length": 3129.8750762939453, + "epoch": 0.12571428571428572, + "grad_norm": 0.18525457382202148, + "kl": 0.0027114152908325195, + "learning_rate": 5.578535828967777e-07, + "loss": 0.0241, + "reward": 0.1006783633492887, + "reward_std": 0.5584467053413391, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/cosine_scaled_reward": -0.08682164316996932, + "step": 110 + }, + { + "clip_fraction": 0.0, + "completion_length": 3500.687530517578, + "epoch": 0.12685714285714286, + "grad_norm": 0.16640202701091766, + "kl": 0.003269195556640625, + "learning_rate": 5.5e-07, + "loss": 0.0407, + "reward": 0.06599505990743637, + "reward_std": 0.5910971527919173, + "rewards/accuracy_reward": 0.16666666977107525, + "rewards/cosine_scaled_reward": -0.10067159991012886, + "step": 111 + }, + { + 
"clip_fraction": 0.0, + "completion_length": 3325.8125610351562, + "epoch": 0.128, + "grad_norm": 0.16706104576587677, + "kl": 0.0014832019805908203, + "learning_rate": 5.421464171032224e-07, + "loss": 0.0229, + "reward": 0.19466983899474144, + "reward_std": 0.5429200492799282, + "rewards/accuracy_reward": 0.2291666753590107, + "rewards/cosine_scaled_reward": -0.03449681680649519, + "step": 112 + }, + { + "clip_fraction": 0.0, + "completion_length": 3154.7916870117188, + "epoch": 0.12914285714285714, + "grad_norm": 0.18158937990665436, + "kl": 0.011591911315917969, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0169, + "reward": 0.09023613715544343, + "reward_std": 0.4198597203940153, + "rewards/accuracy_reward": 0.16666667349636555, + "rewards/cosine_scaled_reward": -0.07643054611980915, + "step": 113 + }, + { + "clip_fraction": 0.0, + "completion_length": 2874.5000381469727, + "epoch": 0.13028571428571428, + "grad_norm": 0.15838727355003357, + "kl": 0.003212451934814453, + "learning_rate": 5.264488196906752e-07, + "loss": -0.0056, + "reward": -0.14509500935673714, + "reward_std": 0.27863842714577913, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/cosine_scaled_reward": -0.22842834889888763, + "step": 114 + }, + { + "clip_fraction": 0.0, + "completion_length": 2977.4791870117188, + "epoch": 0.13142857142857142, + "grad_norm": 0.20102404057979584, + "kl": 0.005918979644775391, + "learning_rate": 5.186095868151436e-07, + "loss": 0.0203, + "reward": 0.08041055500507355, + "reward_std": 0.34110954217612743, + "rewards/accuracy_reward": 0.1875, + "rewards/cosine_scaled_reward": -0.10708944126963615, + "step": 115 + }, + { + "clip_fraction": 0.0, + "completion_length": 3477.7708740234375, + "epoch": 0.13257142857142856, + "grad_norm": 0.1574009358882904, + "kl": 0.002073526382446289, + "learning_rate": 5.107799157635538e-07, + "loss": -0.0103, + "reward": 0.10368268750607967, + "reward_std": 0.2848002705723047, + "rewards/accuracy_reward": 0.1875, + 
"rewards/cosine_scaled_reward": -0.08381731063127518, + "step": 116 + }, + { + "clip_fraction": 0.0, + "completion_length": 3352.8125, + "epoch": 0.1337142857142857, + "grad_norm": 0.2221246063709259, + "kl": 0.003772258758544922, + "learning_rate": 5.02962191529556e-07, + "loss": 0.046, + "reward": -0.11389691138174385, + "reward_std": 0.4279285650700331, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/cosine_scaled_reward": -0.17639691312797368, + "step": 117 + }, + { + "clip_fraction": 0.0, + "completion_length": 3320.604248046875, + "epoch": 0.13485714285714287, + "grad_norm": 0.19471098482608795, + "kl": 0.002338886260986328, + "learning_rate": 4.951587954676837e-07, + "loss": 0.0529, + "reward": 0.3609738126397133, + "reward_std": 0.551423080265522, + "rewards/accuracy_reward": 0.31250000558793545, + "rewards/cosine_scaled_reward": 0.04847381100989878, + "step": 118 + }, + { + "clip_fraction": 0.0, + "completion_length": 2834.875030517578, + "epoch": 0.136, + "grad_norm": 0.18936385214328766, + "kl": 0.013357400894165039, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0535, + "reward": -0.012741273269057274, + "reward_std": 0.41901483573019505, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/cosine_scaled_reward": -0.13774127699434757, + "step": 119 + }, + { + "clip_fraction": 0.0, + "completion_length": 2937.7916984558105, + "epoch": 0.13714285714285715, + "grad_norm": 0.24001307785511017, + "kl": 0.007732391357421875, + "learning_rate": 4.79604490731896e-07, + "loss": -0.0484, + "reward": -0.006065825000405312, + "reward_std": 0.15845976071432233, + "rewards/accuracy_reward": 0.125, + "rewards/cosine_scaled_reward": -0.13106583431363106, + "step": 120 + }, + { + "clip_fraction": 0.0, + "completion_length": 2497.1042098999023, + "epoch": 0.1382857142857143, + "grad_norm": 0.17908261716365814, + "kl": 0.026035308837890625, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0151, + "reward": 0.11876576207578182, + 
"reward_std": 0.3727921061217785, + "rewards/accuracy_reward": 0.18750000186264515, + "rewards/cosine_scaled_reward": -0.06873424537479877, + "step": 121 + }, + { + "clip_fraction": 0.0, + "completion_length": 3167.729217529297, + "epoch": 0.13942857142857143, + "grad_norm": 0.20142368972301483, + "kl": 0.0050547122955322266, + "learning_rate": 4.641359520805548e-07, + "loss": 0.0011, + "reward": 0.2649596631526947, + "reward_std": 0.5385044906288385, + "rewards/accuracy_reward": 0.25000000931322575, + "rewards/cosine_scaled_reward": 0.014959652442485094, + "step": 122 + }, + { + "clip_fraction": 0.0, + "completion_length": 3255.562530517578, + "epoch": 0.14057142857142857, + "grad_norm": 0.15558990836143494, + "kl": 0.0027800798416137695, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0281, + "reward": -0.0696017425507307, + "reward_std": 0.36961269890889525, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/cosine_scaled_reward": -0.17376840766519308, + "step": 123 + }, + { + "clip_fraction": 0.0, + "completion_length": 2784.2709045410156, + "epoch": 0.1417142857142857, + "grad_norm": 0.2072015255689621, + "kl": 0.026355981826782227, + "learning_rate": 4.4877202554526084e-07, + "loss": 0.0621, + "reward": 0.23046518303453922, + "reward_std": 0.6176506169140339, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/cosine_scaled_reward": -0.019534816965460777, + "step": 124 + }, + { + "clip_fraction": 0.0, + "completion_length": 2942.375, + "epoch": 0.14285714285714285, + "grad_norm": 0.17982633411884308, + "kl": 0.07889890670776367, + "learning_rate": 4.4113514698014953e-07, + "loss": -0.0356, + "reward": 0.20839283242821693, + "reward_std": 0.37462584115564823, + "rewards/accuracy_reward": 0.22916666977107525, + "rewards/cosine_scaled_reward": -0.02077384665608406, + "step": 125 + }, + { + "clip_fraction": 0.0, + "completion_length": 2955.916702270508, + "epoch": 0.144, + "grad_norm": 0.16790178418159485, + "kl": 0.0015211105346679688, + 
"learning_rate": 4.3353142970386557e-07, + "loss": 0.0345, + "reward": 0.08628061786293983, + "reward_std": 0.33532324619591236, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/cosine_scaled_reward": -0.12205271050333977, + "step": 126 + }, + { + "clip_fraction": 0.0, + "completion_length": 3365.2083435058594, + "epoch": 0.14514285714285713, + "grad_norm": 0.24060513079166412, + "kl": 0.0030236244201660156, + "learning_rate": 4.2596318988235037e-07, + "loss": 0.0356, + "reward": -0.12239565665367991, + "reward_std": 0.304832368157804, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/cosine_scaled_reward": -0.16406232165172696, + "step": 127 + }, + { + "clip_fraction": 0.0, + "completion_length": 3064.354217529297, + "epoch": 0.1462857142857143, + "grad_norm": 0.17565199732780457, + "kl": 0.004384756088256836, + "learning_rate": 4.1843273287476854e-07, + "loss": 0.0906, + "reward": 0.28212980553507805, + "reward_std": 0.6925715450197458, + "rewards/accuracy_reward": 0.27083333767950535, + "rewards/cosine_scaled_reward": 0.011296482291072607, + "step": 128 + }, + { + "clip_fraction": 0.0, + "completion_length": 3541.0625, + "epoch": 0.14742857142857144, + "grad_norm": 0.15770180523395538, + "kl": 0.0032634735107421875, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0054, + "reward": -0.060815065167844296, + "reward_std": 0.25299688428640366, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/cosine_scaled_reward": -0.1649817330762744, + "step": 129 + }, + { + "clip_fraction": 0.0, + "completion_length": 3418.1458435058594, + "epoch": 0.14857142857142858, + "grad_norm": 0.16763123869895935, + "kl": 0.0033943653106689453, + "learning_rate": 4.034943304942796e-07, + "loss": 0.0367, + "reward": 0.07925572944805026, + "reward_std": 0.4248347468674183, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/cosine_scaled_reward": -0.06657761428505182, + "step": 130 + }, + { + "clip_fraction": 0.0, + "completion_length": 
3062.2292098999023, + "epoch": 0.14971428571428572, + "grad_norm": 0.169542133808136, + "kl": 0.019721031188964844, + "learning_rate": 3.9609093550344907e-07, + "loss": -0.0339, + "reward": 0.32605051435530186, + "reward_std": 0.5189148262143135, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/cosine_scaled_reward": 0.034383836667984724, + "step": 131 + }, + { + "clip_fraction": 0.0, + "completion_length": 3312.1875610351562, + "epoch": 0.15085714285714286, + "grad_norm": 0.1781223863363266, + "kl": 0.0033979415893554688, + "learning_rate": 3.8873442270461485e-07, + "loss": 0.0421, + "reward": 0.27052065124735236, + "reward_std": 0.7929235212504864, + "rewards/accuracy_reward": 0.27083334140479565, + "rewards/cosine_scaled_reward": -0.0003126785159111023, + "step": 132 + }, + { + "clip_fraction": 0.0, + "completion_length": 3373.3541870117188, + "epoch": 0.152, + "grad_norm": 0.19108432531356812, + "kl": 0.0029697418212890625, + "learning_rate": 3.8142703296283953e-07, + "loss": -0.0076, + "reward": -0.12809189222753048, + "reward_std": 0.2327743861824274, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/cosine_scaled_reward": -0.2739252429455519, + "step": 133 + }, + { + "clip_fraction": 0.0, + "completion_length": 3061.6666870117188, + "epoch": 0.15314285714285714, + "grad_norm": 0.1890539675951004, + "kl": 0.01691436767578125, + "learning_rate": 3.7417099217982686e-07, + "loss": -0.0643, + "reward": -0.005162534303963184, + "reward_std": 0.384098834823817, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/cosine_scaled_reward": -0.15099586872383952, + "step": 134 + }, + { + "clip_fraction": 0.0, + "completion_length": 2299.3333892822266, + "epoch": 0.15428571428571428, + "grad_norm": 0.2244781255722046, + "kl": 0.014550924301147461, + "learning_rate": 3.6696851061588994e-07, + "loss": 0.0863, + "reward": 0.7467391490936279, + "reward_std": 0.5308007836574689, + "rewards/accuracy_reward": 0.5000000093132257, + 
"rewards/cosine_scaled_reward": 0.24673915409948677, + "step": 135 + }, + { + "clip_fraction": 0.0, + "completion_length": 3300.187530517578, + "epoch": 0.15542857142857142, + "grad_norm": 0.18033601343631744, + "kl": 0.004876136779785156, + "learning_rate": 3.5982178221668533e-07, + "loss": 0.0506, + "reward": 0.16450288891792297, + "reward_std": 0.5824453700333834, + "rewards/accuracy_reward": 0.20833333767950535, + "rewards/cosine_scaled_reward": -0.04383046319708228, + "step": 136 + }, + { + "clip_fraction": 0.0, + "completion_length": 3515.1041870117188, + "epoch": 0.15657142857142858, + "grad_norm": 0.14998017251491547, + "kl": 0.005837440490722656, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0241, + "reward": -0.27306424523703754, + "reward_std": 0.2572753308340907, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.29389757476747036, + "step": 137 + }, + { + "clip_fraction": 0.0, + "completion_length": 3021.7083740234375, + "epoch": 0.15771428571428572, + "grad_norm": 0.17564791440963745, + "kl": 0.0214078426361084, + "learning_rate": 3.45704275117204e-07, + "loss": -0.0187, + "reward": 0.028833418153226376, + "reward_std": 0.3300330266356468, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/cosine_scaled_reward": -0.09616657719016075, + "step": 138 + }, + { + "clip_fraction": 0.0, + "completion_length": 3443.187530517578, + "epoch": 0.15885714285714286, + "grad_norm": 0.17364783585071564, + "kl": 0.003993988037109375, + "learning_rate": 3.387377967463493e-07, + "loss": -0.0154, + "reward": -0.1509188860654831, + "reward_std": 0.3400895958766341, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/cosine_scaled_reward": -0.19258555606938899, + "step": 139 + }, + { + "clip_fraction": 0.0, + "completion_length": 3458.5833740234375, + "epoch": 0.16, + "grad_norm": 0.2179875522851944, + "kl": 0.009866714477539062, + "learning_rate": 3.3183567088914833e-07, + "loss": -0.0423, + "reward": 
0.11468263063579798, + "reward_std": 0.3303603548556566, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/cosine_scaled_reward": -0.05198403773829341, + "step": 140 + }, + { + "clip_fraction": 0.0, + "completion_length": 3385.3958740234375, + "epoch": 0.16114285714285714, + "grad_norm": 0.17362990975379944, + "kl": 0.0052642822265625, + "learning_rate": 3.250000000000001e-07, + "loss": -0.0233, + "reward": 0.12504707090556622, + "reward_std": 0.5917222537100315, + "rewards/accuracy_reward": 0.20833333767950535, + "rewards/cosine_scaled_reward": -0.08328627422451973, + "step": 141 + }, + { + "clip_fraction": 0.0, + "completion_length": 3200.6459045410156, + "epoch": 0.16228571428571428, + "grad_norm": 0.19476763904094696, + "kl": 0.005651950836181641, + "learning_rate": 3.182328662904756e-07, + "loss": 0.0829, + "reward": -0.07443058118224144, + "reward_std": 0.4768957942724228, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/cosine_scaled_reward": -0.1994305873522535, + "step": 142 + }, + { + "clip_fraction": 0.0, + "completion_length": 3258.6875610351562, + "epoch": 0.16342857142857142, + "grad_norm": 0.23263806104660034, + "kl": 0.00710296630859375, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0686, + "reward": -0.07686329772695899, + "reward_std": 0.355038208886981, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/cosine_scaled_reward": -0.18102996051311493, + "step": 143 + }, + { + "clip_fraction": 0.0, + "completion_length": 3194.75, + "epoch": 0.16457142857142856, + "grad_norm": 0.17562638223171234, + "kl": 0.0062198638916015625, + "learning_rate": 3.0491243424323783e-07, + "loss": -0.0268, + "reward": -0.06668785959482193, + "reward_std": 0.3318067956715822, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/cosine_scaled_reward": -0.19168786704540253, + "step": 144 + }, + { + "clip_fraction": 0.0, + "completion_length": 2559.2500076293945, + "epoch": 0.1657142857142857, + "grad_norm": 
0.24671532213687897, + "kl": 0.009662628173828125, + "learning_rate": 2.9836319343816397e-07, + "loss": -0.0379, + "reward": 0.37313266890123487, + "reward_std": 0.33964273426681757, + "rewards/accuracy_reward": 0.31250000186264515, + "rewards/cosine_scaled_reward": 0.060632668901234865, + "step": 145 + }, + { + "clip_fraction": 0.0, + "completion_length": 3086.375015258789, + "epoch": 0.16685714285714287, + "grad_norm": 0.15435300767421722, + "kl": 0.0024802684783935547, + "learning_rate": 2.918906036420294e-07, + "loss": 0.0125, + "reward": -0.19754038006067276, + "reward_std": 0.2727511157281697, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/cosine_scaled_reward": -0.23920704424381256, + "step": 146 + }, + { + "clip_fraction": 0.0, + "completion_length": 3410.8541870117188, + "epoch": 0.168, + "grad_norm": 0.2101191282272339, + "kl": 0.004631996154785156, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0229, + "reward": -0.13149442267604172, + "reward_std": 0.3892161212861538, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/cosine_scaled_reward": -0.19399441499263048, + "step": 147 + }, + { + "clip_fraction": 0.0, + "completion_length": 2876.1250610351562, + "epoch": 0.16914285714285715, + "grad_norm": 0.18126218020915985, + "kl": 0.008637428283691406, + "learning_rate": 2.791832395815782e-07, + "loss": 0.0202, + "reward": -0.013470180332660675, + "reward_std": 0.2522149607539177, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/cosine_scaled_reward": -0.15930352546274662, + "step": 148 + }, + { + "clip_fraction": 0.0, + "completion_length": 3289.3750610351562, + "epoch": 0.1702857142857143, + "grad_norm": 0.16630330681800842, + "kl": 0.004746913909912109, + "learning_rate": 2.729523361034538e-07, + "loss": 0.0387, + "reward": 0.05808070907369256, + "reward_std": 0.4934985414147377, + "rewards/accuracy_reward": 0.16666666977107525, + "rewards/cosine_scaled_reward": -0.10858596302568913, + "step": 149 + }, + { + 
"clip_fraction": 0.0, + "completion_length": 3029.5417098999023, + "epoch": 0.17142857142857143, + "grad_norm": 0.16553689539432526, + "kl": 0.007030487060546875, + "learning_rate": 2.6680582402757324e-07, + "loss": 0.0283, + "reward": 0.3249948853626847, + "reward_std": 0.5702541470527649, + "rewards/accuracy_reward": 0.29166667722165585, + "rewards/cosine_scaled_reward": 0.03332819044589996, + "step": 150 + }, + { + "clip_fraction": 0.0, + "completion_length": 3161.3750610351562, + "epoch": 0.17257142857142857, + "grad_norm": 0.18477413058280945, + "kl": 0.005667924880981445, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0607, + "reward": 0.26313829235732555, + "reward_std": 0.5168293453752995, + "rewards/accuracy_reward": 0.25000000186264515, + "rewards/cosine_scaled_reward": 0.013138292357325554, + "step": 151 + }, + { + "clip_fraction": 0.0, + "completion_length": 3020.750011444092, + "epoch": 0.1737142857142857, + "grad_norm": 0.25039243698120117, + "kl": 0.0038559436798095703, + "learning_rate": 2.547734369542718e-07, + "loss": -0.0343, + "reward": -0.17000860534608364, + "reward_std": 0.290488900616765, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/cosine_scaled_reward": -0.21167527325451374, + "step": 152 + }, + { + "clip_fraction": 0.0, + "completion_length": 3283.4583587646484, + "epoch": 0.17485714285714285, + "grad_norm": 0.2211620956659317, + "kl": 0.025887489318847656, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0627, + "reward": -0.1608979816082865, + "reward_std": 0.30768819246441126, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/cosine_scaled_reward": -0.22339799162000418, + "step": 153 + }, + { + "clip_fraction": 0.0, + "completion_length": 3509.625, + "epoch": 0.176, + "grad_norm": 0.15299846231937408, + "kl": 0.0022106170654296875, + "learning_rate": 2.4310073797187573e-07, + "loss": 0.0166, + "reward": 0.2281611803919077, + "reward_std": 0.6786817125976086, + "rewards/accuracy_reward": 
0.2291666716337204, + "rewards/cosine_scaled_reward": -0.0010055112652480602, + "step": 154 + }, + { + "clip_fraction": 0.0, + "completion_length": 2825.2708587646484, + "epoch": 0.17714285714285713, + "grad_norm": 0.22317463159561157, + "kl": 0.022104263305664062, + "learning_rate": 2.374037332934512e-07, + "loss": -0.0181, + "reward": 0.3611069116741419, + "reward_std": 0.5537243597209454, + "rewards/accuracy_reward": 0.3125000111758709, + "rewards/cosine_scaled_reward": 0.04860687721520662, + "step": 155 + }, + { + "clip_fraction": 0.0, + "completion_length": 3509.3333435058594, + "epoch": 0.1782857142857143, + "grad_norm": 0.20389185845851898, + "kl": 0.004364371299743652, + "learning_rate": 2.3180194846605364e-07, + "loss": 0.0435, + "reward": -0.07293254625983536, + "reward_std": 0.3572534155100584, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/cosine_scaled_reward": -0.15626587718725204, + "step": 156 + }, + { + "clip_fraction": 0.0, + "completion_length": 3416.7291870117188, + "epoch": 0.17942857142857144, + "grad_norm": 0.16730445623397827, + "kl": 0.005460262298583984, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.0057, + "reward": -0.07636097725480795, + "reward_std": 0.2699170485138893, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/cosine_scaled_reward": -0.15969432052224874, + "step": 157 + }, + { + "clip_fraction": 0.0, + "completion_length": 3422.0625610351562, + "epoch": 0.18057142857142858, + "grad_norm": 0.14682160317897797, + "kl": 0.0034723281860351562, + "learning_rate": 2.2089083427137329e-07, + "loss": 0.0179, + "reward": 0.32586812041699886, + "reward_std": 0.3167658653110266, + "rewards/accuracy_reward": 0.25000000558793545, + "rewards/cosine_scaled_reward": 0.07586812693625689, + "step": 158 + }, + { + "clip_fraction": 0.0, + "completion_length": 3442.6041870117188, + "epoch": 0.18171428571428572, + "grad_norm": 0.15924254059791565, + "kl": 0.014659881591796875, + "learning_rate": 
2.1558482853517253e-07, + "loss": 0.0161, + "reward": -0.2088061198592186, + "reward_std": 0.26268018409609795, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.2296394552104175, + "step": 159 + }, + { + "clip_fraction": 0.0, + "completion_length": 3276.9791870117188, + "epoch": 0.18285714285714286, + "grad_norm": 0.1880086064338684, + "kl": 0.008440017700195312, + "learning_rate": 2.1038068889975259e-07, + "loss": 0.0752, + "reward": 0.1678475480293855, + "reward_std": 0.5140087231993675, + "rewards/accuracy_reward": 0.2291666753590107, + "rewards/cosine_scaled_reward": -0.06131910812109709, + "step": 160 + }, + { + "clip_fraction": 0.0, + "completion_length": 3382.1875610351562, + "epoch": 0.184, + "grad_norm": 0.1801372915506363, + "kl": 0.015121936798095703, + "learning_rate": 2.0528000059645995e-07, + "loss": 0.0314, + "reward": 0.0975832249969244, + "reward_std": 0.4436509357765317, + "rewards/accuracy_reward": 0.16666667349636555, + "rewards/cosine_scaled_reward": -0.06908343941904604, + "step": 161 + }, + { + "clip_fraction": 0.0, + "completion_length": 3415.6875, + "epoch": 0.18514285714285714, + "grad_norm": 0.235793799161911, + "kl": 0.00775146484375, + "learning_rate": 2.0028431734436308e-07, + "loss": 0.0693, + "reward": 0.02907794527709484, + "reward_std": 0.47673553973436356, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/cosine_scaled_reward": -0.11675538681447506, + "step": 162 + }, + { + "clip_fraction": 0.0, + "completion_length": 2728.625045776367, + "epoch": 0.18628571428571428, + "grad_norm": 0.2469216287136078, + "kl": 0.004032135009765625, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.1104, + "reward": 0.4551220340654254, + "reward_std": 0.6010043127462268, + "rewards/accuracy_reward": 0.33333333767950535, + "rewards/cosine_scaled_reward": 0.12178870104253292, + "step": 163 + }, + { + "clip_fraction": 0.0, + "completion_length": 2826.9166870117188, + "epoch": 0.18742857142857142, + 
"grad_norm": 0.2510416507720947, + "kl": 0.010884761810302734, + "learning_rate": 1.9061402047871833e-07, + "loss": 0.0495, + "reward": 0.29104864224791527, + "reward_std": 0.41844242811203003, + "rewards/accuracy_reward": 0.2708333395421505, + "rewards/cosine_scaled_reward": 0.020215285941958427, + "step": 164 + }, + { + "clip_fraction": 0.0, + "completion_length": 3338.875030517578, + "epoch": 0.18857142857142858, + "grad_norm": 0.17247295379638672, + "kl": 0.005005836486816406, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.0091, + "reward": -0.10422082059085369, + "reward_std": 0.4068410564213991, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/cosine_scaled_reward": -0.18755416944622993, + "step": 165 + }, + { + "clip_fraction": 0.0, + "completion_length": 3450.0833740234375, + "epoch": 0.18971428571428572, + "grad_norm": 0.13382409512996674, + "kl": 0.002165555953979492, + "learning_rate": 1.8138158006995363e-07, + "loss": 0.0106, + "reward": 0.05564507842063904, + "reward_std": 0.5360854268074036, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/cosine_scaled_reward": -0.09018826205283403, + "step": 166 + }, + { + "clip_fraction": 0.0, + "completion_length": 2906.791748046875, + "epoch": 0.19085714285714286, + "grad_norm": 0.17852716147899628, + "kl": 0.008731842041015625, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.0562, + "reward": 0.060366696037817746, + "reward_std": 0.4651718642562628, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/cosine_scaled_reward": -0.08546665217727423, + "step": 167 + }, + { + "clip_fraction": 0.0, + "completion_length": 3323.2291870117188, + "epoch": 0.192, + "grad_norm": 0.14015178382396698, + "kl": 0.00559234619140625, + "learning_rate": 1.7259824442455923e-07, + "loss": 0.0001, + "reward": -0.00011509843170642853, + "reward_std": 0.48929045908153057, + "rewards/accuracy_reward": 0.12500000186264515, + "rewards/cosine_scaled_reward": -0.12511511333286762, + "step": 168 + 
}, + { + "clip_fraction": 0.0, + "completion_length": 2785.3333587646484, + "epoch": 0.19314285714285714, + "grad_norm": 0.17836594581604004, + "kl": 0.0029582977294921875, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.0415, + "reward": 0.7883107475936413, + "reward_std": 0.31396112497895956, + "rewards/accuracy_reward": 0.5416666679084301, + "rewards/cosine_scaled_reward": 0.24664408713579178, + "step": 169 + }, + { + "clip_fraction": 0.0, + "completion_length": 2772.9375228881836, + "epoch": 0.19428571428571428, + "grad_norm": 0.1806839257478714, + "kl": 0.006674766540527344, + "learning_rate": 1.6427471468404952e-07, + "loss": 0.0447, + "reward": 0.19183126650750637, + "reward_std": 0.3847146909683943, + "rewards/accuracy_reward": 0.2291666679084301, + "rewards/cosine_scaled_reward": -0.03733538277447224, + "step": 170 + }, + { + "clip_fraction": 0.0, + "completion_length": 3159.541717529297, + "epoch": 0.19542857142857142, + "grad_norm": 0.17659954726696014, + "kl": 0.002955913543701172, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.0338, + "reward": 0.261606702581048, + "reward_std": 0.4677132572978735, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/cosine_scaled_reward": 0.011606710031628609, + "step": 171 + }, + { + "clip_fraction": 0.0, + "completion_length": 3158.8958435058594, + "epoch": 0.19657142857142856, + "grad_norm": 0.1956152766942978, + "kl": 0.006173133850097656, + "learning_rate": 1.5642113178727193e-07, + "loss": 0.0097, + "reward": 0.2144802352413535, + "reward_std": 0.46392463706433773, + "rewards/accuracy_reward": 0.2500000037252903, + "rewards/cosine_scaled_reward": -0.035519770928658545, + "step": 172 + }, + { + "clip_fraction": 0.0, + "completion_length": 2472.437530517578, + "epoch": 0.1977142857142857, + "grad_norm": 0.254905641078949, + "kl": 0.008039474487304688, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0255, + "reward": -0.041594887152314186, + "reward_std": 0.4095227625221014, + 
"rewards/accuracy_reward": 0.1250000037252903, + "rewards/cosine_scaled_reward": -0.16659490356687456, + "step": 173 + }, + { + "clip_fraction": 0.0, + "completion_length": 3101.9584045410156, + "epoch": 0.19885714285714284, + "grad_norm": 0.18561899662017822, + "kl": 0.0075778961181640625, + "learning_rate": 1.4904706411523448e-07, + "loss": 0.035, + "reward": 0.24006008356809616, + "reward_std": 0.5043942555785179, + "rewards/accuracy_reward": 0.22916666977107525, + "rewards/cosine_scaled_reward": 0.010893410071730614, + "step": 174 + }, + { + "clip_fraction": 0.0, + "completion_length": 2890.500030517578, + "epoch": 0.2, + "grad_norm": 0.17110423743724823, + "kl": 0.01293182373046875, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.0116, + "reward": 0.32882001250982285, + "reward_std": 0.2852998124435544, + "rewards/accuracy_reward": 0.3125, + "rewards/cosine_scaled_reward": 0.016320008784532547, + "step": 175 + }, + { + "clip_fraction": 0.0, + "completion_length": 3115.0208892822266, + "epoch": 0.20114285714285715, + "grad_norm": 0.2752262055873871, + "kl": 0.008016586303710938, + "learning_rate": 1.4216149583350755e-07, + "loss": 0.0955, + "reward": 0.15877557545900345, + "reward_std": 0.6403312431648374, + "rewards/accuracy_reward": 0.20833333767950535, + "rewards/cosine_scaled_reward": -0.049557752907276154, + "step": 176 + }, + { + "clip_fraction": 0.0, + "completion_length": 3386.5833435058594, + "epoch": 0.2022857142857143, + "grad_norm": 0.1783076673746109, + "kl": 0.019231557846069336, + "learning_rate": 1.3890454406082956e-07, + "loss": -0.0247, + "reward": 0.05885206814855337, + "reward_std": 0.3794629080221057, + "rewards/accuracy_reward": 0.125, + "rewards/cosine_scaled_reward": -0.06614794302731752, + "step": 177 + }, + { + "clip_fraction": 0.0, + "completion_length": 3214.7291870117188, + "epoch": 0.20342857142857143, + "grad_norm": 0.1741504669189453, + "kl": 0.021752357482910156, + "learning_rate": 1.3577281594640182e-07, + "loss": 
-0.019, + "reward": 0.19468790292739868, + "reward_std": 0.5135045610368252, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/cosine_scaled_reward": -0.013645432889461517, + "step": 178 + }, + { + "clip_fraction": 0.0, + "completion_length": 3233.4166870117188, + "epoch": 0.20457142857142857, + "grad_norm": 0.16726212203502655, + "kl": 0.0071485042572021484, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0061, + "reward": -0.21683532558381557, + "reward_std": 0.26457902044057846, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.23766866483492777, + "step": 179 + }, + { + "clip_fraction": 0.0, + "completion_length": 2661.5000228881836, + "epoch": 0.2057142857142857, + "grad_norm": 0.30925920605659485, + "kl": 0.015074729919433594, + "learning_rate": 1.2988880807625927e-07, + "loss": 0.0328, + "reward": 0.512437904253602, + "reward_std": 0.387176813557744, + "rewards/accuracy_reward": 0.3958333358168602, + "rewards/cosine_scaled_reward": 0.11660455353558064, + "step": 180 + }, + { + "clip_fraction": 0.0, + "completion_length": 3386.437530517578, + "epoch": 0.20685714285714285, + "grad_norm": 0.2104531228542328, + "kl": 0.0052337646484375, + "learning_rate": 1.2713832064634125e-07, + "loss": 0.0401, + "reward": -0.050248525105416775, + "reward_std": 0.3244944280013442, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/cosine_scaled_reward": -0.15441520046442747, + "step": 181 + }, + { + "clip_fraction": 0.0, + "completion_length": 2823.104179382324, + "epoch": 0.208, + "grad_norm": 0.1794274002313614, + "kl": 0.02399587631225586, + "learning_rate": 1.2451664098030743e-07, + "loss": -0.0603, + "reward": 0.1379284020513296, + "reward_std": 0.4621511232107878, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/cosine_scaled_reward": -0.049571617506444454, + "step": 182 + }, + { + "clip_fraction": 0.0, + "completion_length": 3123.6458435058594, + "epoch": 0.20914285714285713, + "grad_norm": 
0.17760169506072998, + "kl": 0.023395538330078125, + "learning_rate": 1.220245676671809e-07, + "loss": 0.0078, + "reward": 0.16959657333791256, + "reward_std": 0.3798244297504425, + "rewards/accuracy_reward": 0.2291666679084301, + "rewards/cosine_scaled_reward": -0.059570083394646645, + "step": 183 + }, + { + "clip_fraction": 0.0, + "completion_length": 3245.2708435058594, + "epoch": 0.2102857142857143, + "grad_norm": 0.19478560984134674, + "kl": 0.005706787109375, + "learning_rate": 1.1966285981663407e-07, + "loss": -0.0293, + "reward": -0.055914007127285004, + "reward_std": 0.2492837980389595, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/cosine_scaled_reward": -0.1600806824862957, + "step": 184 + }, + { + "clip_fraction": 0.0, + "completion_length": 3321.2708740234375, + "epoch": 0.21142857142857144, + "grad_norm": 0.17917311191558838, + "kl": 0.008937358856201172, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.0092, + "reward": -0.04576564393937588, + "reward_std": 0.39649237506091595, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/cosine_scaled_reward": -0.14993230905383825, + "step": 185 + }, + { + "clip_fraction": 0.0, + "completion_length": 3297.6458740234375, + "epoch": 0.21257142857142858, + "grad_norm": 0.1722106635570526, + "kl": 0.003765106201171875, + "learning_rate": 1.1533337816991931e-07, + "loss": 0.0326, + "reward": 0.4093305990099907, + "reward_std": 0.3512581354007125, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/cosine_scaled_reward": 0.0759972594678402, + "step": 186 + }, + { + "clip_fraction": 0.0, + "completion_length": 3079.541717529297, + "epoch": 0.21371428571428572, + "grad_norm": 0.27707189321517944, + "kl": 0.00933837890625, + "learning_rate": 1.1336692317580158e-07, + "loss": -0.0215, + "reward": -0.08524070866405964, + "reward_std": 0.38815517351031303, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/cosine_scaled_reward": -0.16857404820621014, + "step": 187 + }, + { + 
"clip_fraction": 0.0, + "completion_length": 3571.2916870117188, + "epoch": 0.21485714285714286, + "grad_norm": 0.1520886868238449, + "kl": 0.0035920143127441406, + "learning_rate": 1.1153347084664419e-07, + "loss": 0.002, + "reward": -0.1857756022363901, + "reward_std": 0.2816652413457632, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.20660893432796001, + "step": 188 + }, + { + "clip_fraction": 0.0, + "completion_length": 3236.5208587646484, + "epoch": 0.216, + "grad_norm": 0.19879254698753357, + "kl": 0.006503105163574219, + "learning_rate": 1.0983357966978745e-07, + "loss": 0.0444, + "reward": -0.016374904662370682, + "reward_std": 0.4003318352624774, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/cosine_scaled_reward": -0.1622082181274891, + "step": 189 + }, + { + "clip_fraction": 0.0, + "completion_length": 3082.4583740234375, + "epoch": 0.21714285714285714, + "grad_norm": 0.16634653508663177, + "kl": 0.00673985481262207, + "learning_rate": 1.0826776744855121e-07, + "loss": 0.0302, + "reward": 0.24105202872306108, + "reward_std": 0.5610934533178806, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/cosine_scaled_reward": -0.02978132525458932, + "step": 190 + }, + { + "clip_fraction": 0.0, + "completion_length": 2954.7708740234375, + "epoch": 0.21828571428571428, + "grad_norm": 0.2336689531803131, + "kl": 0.009424209594726562, + "learning_rate": 1.068365111445064e-07, + "loss": 0.121, + "reward": 0.170956801623106, + "reward_std": 0.3355375565588474, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/cosine_scaled_reward": -0.037376536056399345, + "step": 191 + }, + { + "clip_fraction": 0.0, + "completion_length": 3462.2916870117188, + "epoch": 0.21942857142857142, + "grad_norm": 0.1604224145412445, + "kl": 0.0033931732177734375, + "learning_rate": 1.0554024673218806e-07, + "loss": 0.0184, + "reward": -0.08346282877027988, + "reward_std": 0.28180871857330203, + "rewards/accuracy_reward": 
0.0833333358168602, + "rewards/cosine_scaled_reward": -0.16679617203772068, + "step": 192 + }, + { + "clip_fraction": 0.0, + "completion_length": 3324.479202270508, + "epoch": 0.22057142857142858, + "grad_norm": 0.23563383519649506, + "kl": 0.004616260528564453, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0425, + "reward": -0.1967187790432945, + "reward_std": 0.2698577819392085, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.21755211427807808, + "step": 193 + }, + { + "clip_fraction": 0.0, + "completion_length": 3275.7916870117188, + "epoch": 0.22171428571428572, + "grad_norm": 0.14834217727184296, + "kl": 0.002389192581176758, + "learning_rate": 1.0335423176140511e-07, + "loss": -0.012, + "reward": 0.502600722014904, + "reward_std": 0.4794109221547842, + "rewards/accuracy_reward": 0.3541666679084301, + "rewards/cosine_scaled_reward": 0.14843403454869986, + "step": 194 + }, + { + "clip_fraction": 0.0, + "completion_length": 3356.8541870117188, + "epoch": 0.22285714285714286, + "grad_norm": 0.16246356070041656, + "kl": 0.0031523704528808594, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.0019, + "reward": 0.14120314689353108, + "reward_std": 0.36129687167704105, + "rewards/accuracy_reward": 0.20833333395421505, + "rewards/cosine_scaled_reward": -0.06713018752634525, + "step": 195 + }, + { + "clip_fraction": 0.0, + "completion_length": 3576.2291870117188, + "epoch": 0.224, + "grad_norm": 0.15277273952960968, + "kl": 0.0033049583435058594, + "learning_rate": 1.017123858587145e-07, + "loss": 0.0026, + "reward": -0.04466534685343504, + "reward_std": 0.37323744408786297, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/cosine_scaled_reward": -0.12799868267029524, + "step": 196 + }, + { + "clip_fraction": 0.0, + "completion_length": 2975.020835876465, + "epoch": 0.22514285714285714, + "grad_norm": 0.24708448350429535, + "kl": 0.008154869079589844, + "learning_rate": 1.0109617738307911e-07, + "loss": 
0.0281, + "reward": 0.3771647736430168, + "reward_std": 0.6537368502467871, + "rewards/accuracy_reward": 0.2916666753590107, + "rewards/cosine_scaled_reward": 0.08549809700343758, + "step": 197 + }, + { + "clip_fraction": 0.0, + "completion_length": 3131.687530517578, + "epoch": 0.22628571428571428, + "grad_norm": 0.17502129077911377, + "kl": 0.016117095947265625, + "learning_rate": 1.0061670936044178e-07, + "loss": -0.0111, + "reward": 0.12027511559426785, + "reward_std": 0.26299865636974573, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/cosine_scaled_reward": -0.08805821277201176, + "step": 198 + }, + { + "clip_fraction": 0.0, + "completion_length": 3566.75, + "epoch": 0.22742857142857142, + "grad_norm": 0.14533254504203796, + "kl": 0.005496978759765625, + "learning_rate": 1.002741278414069e-07, + "loss": 0.0099, + "reward": -0.23100331239402294, + "reward_std": 0.262982377782464, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/cosine_scaled_reward": -0.25183664448559284, + "step": 199 + }, + { + "clip_fraction": 0.0, + "completion_length": 2976.791717529297, + "epoch": 0.22857142857142856, + "grad_norm": 0.18751339614391327, + "kl": 0.0031397342681884766, + "learning_rate": 1.0006853717962393e-07, + "loss": 0.0796, + "reward": 0.2756543140858412, + "reward_std": 0.398221081122756, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/cosine_scaled_reward": 0.004820965230464935, + "step": 200 + }, + { + "epoch": 0.22857142857142856, + "step": 200, + "total_flos": 0.0, + "train_loss": 0.024151897984629613, + "train_runtime": 32217.4606, + "train_samples_per_second": 0.298, + "train_steps_per_second": 0.006 + } + ], + "logging_steps": 1, + "max_steps": 200, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + 
"attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..57c71d9 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad651e6d54ee51eab745785befc80d3599f5daaaf7e1e016ec102d3685819de6 +size 8760