From 04ef26cf1a63eddb7d5902e5d0bec82535627f97 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Mon, 18 May 2026 22:20:14 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: gshasiri/SmolLM3-DPO-Second-Round Source: Original Platform --- .gitattributes | 36 + README.md | 69 + all_results.json | 9 + chat_template.jinja | 96 + config.json | 36 + generation_config.json | 12 + model.safetensors | 3 + special_tokens_map.json | 17 + tokenizer.json | 3 + tokenizer_config.json | 2063 ++ train_results.json | 9 + trainer_state.json | 75643 ++++++++++++++++++++++++++++++++++++++ training_args.bin | 3 + 13 files changed, 77999 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs 
diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..bc3ecf6 --- /dev/null +++ b/README.md @@ -0,0 +1,69 @@ +--- +base_model: gshasiri/SmolLM3-SFT-Second-Round +library_name: transformers +model_name: SmolLM3-DPO-Second-Round +tags: +- generated_from_trainer +- trl +- dpo +licence: license +--- + +# Model Card for SmolLM3-DPO-Second-Round + +This model is a fine-tuned version of [gshasiri/SmolLM3-SFT-Second-Round](https://huggingface.co/gshasiri/SmolLM3-SFT-Second-Round). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" 
+generator = pipeline("text-generation", model="gshasiri/SmolLM3-DPO-Second-Round", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/shamanework-pl/huggingface/runs/m3gqcxh6) + + +This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290). + +### Framework versions + +- TRL: 0.25.1 +- Transformers: 4.57.1 +- Pytorch: 2.6.0+cu126 +- Datasets: 4.4.1 +- Tokenizers: 0.22.1 + +## Citations + +Cite DPO as: + +```bibtex +@inproceedings{rafailov2023direct, + title = {{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}}, + author = {Rafael Rafailov and Archit Sharma and Eric Mitchell and Christopher D. Manning and Stefano Ermon and Chelsea Finn}, + year = 2023, + booktitle = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023}, + url = {http://papers.nips.cc/paper_files/paper/2023/hash/a85b405ed65c6477a4fe8302b5e06ce7-Abstract-Conference.html}, + editor = {Alice Oh and Tristan Naumann and Amir Globerson and Kate Saenko and Moritz Hardt and Sergey Levine}, +} +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..22d4589 --- 
/dev/null +++ b/all_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 1.0, + "total_flos": 0.0, + "train_loss": 0.45319234429607314, + "train_runtime": 13193.0651, + "train_samples": 161271, + "train_samples_per_second": 12.224, + "train_steps_per_second": 0.382 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..b481759 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,96 @@ +{# ───── defaults ───── #} +{%- if enable_thinking is not defined -%} +{%- set enable_thinking = true -%} +{%- endif -%} + +{# ───── reasoning mode ───── #} +{%- if enable_thinking -%} + {%- set reasoning_mode = "/think" -%} +{%- else -%} + {%- set reasoning_mode = "/no_think" -%} +{%- endif -%} + +{# ───── header (system message) ───── #} +{{- "<|im_start|>system\n" -}} + +{%- if messages[0].role == "system" -%} + {%- set system_message = messages[0].content -%} + {%- if "/no_think" in system_message -%} + {%- set reasoning_mode = "/no_think" -%} + {%- elif "/think" in system_message -%} + {%- set reasoning_mode = "/think" -%} + {%- endif -%} + {%- set custom_instructions = system_message.replace("/no_think", "").replace("/think", "").rstrip() -%} +{%- endif -%} + +{%- if "/system_override" in system_message -%} + {{- custom_instructions.replace("/system_override", "").rstrip() -}} + {{- "<|im_end|>\n" -}} +{%- else -%} + {{- "## Metadata\n\n" -}} + {{- "Knowledge Cutoff Date: June 2025\n" -}} + {%- set today = strftime_now("%d %B %Y") -%} + {{- "Today Date: " ~ today ~ "\n" -}} + {{- "Reasoning Mode: " + reasoning_mode + "\n\n" -}} + + {{- "## Custom Instructions\n\n" -}} + {%- if custom_instructions -%} + {{- custom_instructions + "\n\n" -}} + {%- elif reasoning_mode == "/think" -%} + {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. 
This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracking, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> Thought section </think> Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion.\n\n" -}} + {%- else -%} + {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face.\n\n" -}} + {%- endif -%} + + {{- "## Tools\n\n" -}} + {{- "### XML Tools\n\n" -}} + {%- if tools -%} + {%- set ns = namespace(xml_tool_string="You may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n\n<tools>\n") -%} + {%- for tool in tools -%} + {%- set ns.xml_tool_string = ns.xml_tool_string ~ (tool | tojson) ~ "\n" -%} + {%- endfor -%} + {%- set xml_tools = ns.xml_tool_string + "</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags." 
-%} + {%- endif -%} + {%- if xml_tools -%} + {{- xml_tools -}} + {%- else -%} + {{- "None" -}} + {%- endif -%} + {{- "\n\n" -}} + {{- "### Python Tools\n\n" -}} + {%- if python_tools -%} + {{- python_tools -}} + {%- else -%} + {{- "None" -}} + {%- endif -%} + {{- "\n\n" -}} + {{- "<|im_end|>\n" -}} +{%- endif -%} + +{# ───── main loop ───── #} +{%- for message in messages -%} + {%- set content = message.content if message.content is string else "" -%} + {%- if message.role == "user" -%} + {{ "<|im_start|>" + message.role + "\n" + content + "<|im_end|>\n" }} + {%- elif message.role == "assistant" -%} + {% generation %} + {%- if reasoning_mode == "/think" -%} + {{ "<|im_start|>assistant\n" + content.lstrip("\n") + "<|im_end|>\n" }} + {%- else -%} + {{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" + content.lstrip("\n") + "<|im_end|>\n" }} + {%- endif -%} + {% endgeneration %} + + {%- elif message.role == "tool" -%} + {{ "<|im_start|>" + "user\n" + content + "<|im_end|>\n" }} + {%- endif -%} +{%- endfor -%} + +{# ───── generation prompt ───── #} +{%- if add_generation_prompt -%} + {%- if reasoning_mode == "/think" -%} + {{ "<|im_start|>assistant\n" }} + {%- else -%} + {{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" }} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..3ecc717 --- /dev/null +++ b/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128012, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128012, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + 
"high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.57.1", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..024647e --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128012 + ], + "pad_token_id": 128012, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.1" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..6c55887 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec688512483a916034870c8d9c38199da0c682cc7e8465c69de6993ee92c4051 +size 2471645608 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..9fed049 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|im_end|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..f342589 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e7c979daf2c715603b21e094ce7e032280b007311a070cdf98ed708c492d614 +size 17209792 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..636c7ef --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128014": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128015": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128016": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128017": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128018": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": 
"<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": 
"<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": 
"<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": 
"<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|im_end|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|im_end|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..22d4589 --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 1.0, + "total_flos": 0.0, + "train_loss": 0.45319234429607314, + "train_runtime": 13193.0651, + "train_samples": 161271, + "train_samples_per_second": 12.224, + "train_steps_per_second": 0.382 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..27446f5 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,75643 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5040, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + 
"log_history": [ + { + "epoch": 0.00019842254080063495, + "grad_norm": 98.61774046173943, + "learning_rate": 0.0, + "logits/chosen": 4.8046875, + "logits/rejected": 4.96875, + "logps/chosen": -868.0, + "logps/rejected": -556.0, + "loss": 1.0, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0003968450816012699, + "grad_norm": 160.97643146438963, + "learning_rate": 1.984126984126984e-09, + "logits/chosen": 3.84765625, + "logits/rejected": 4.3125, + "logps/chosen": -1256.0, + "logps/rejected": -2156.0, + "loss": 1.0, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0005952676224019049, + "grad_norm": 96.21385296063366, + "learning_rate": 3.968253968253968e-09, + "logits/chosen": 4.3828125, + "logits/rejected": 4.6953125, + "logps/chosen": -898.0, + "logps/rejected": -631.5, + "loss": 1.0349, + "rewards/accuracies": 0.15625, + "rewards/chosen": -0.1244964599609375, + "rewards/margins": -0.160400390625, + "rewards/rejected": 0.0362091064453125, + "step": 3 + }, + { + "epoch": 0.0007936901632025398, + "grad_norm": 110.08391671160577, + "learning_rate": 5.952380952380952e-09, + "logits/chosen": 4.30078125, + "logits/rejected": 4.625, + "logps/chosen": -1096.0, + "logps/rejected": -773.0, + "loss": 1.0156, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.030487060546875, + "rewards/margins": -0.078094482421875, + "rewards/rejected": 0.10870361328125, + "step": 4 + }, + { + "epoch": 0.0009921127040031748, + "grad_norm": 123.01762291198531, + "learning_rate": 7.936507936507936e-09, + "logits/chosen": 4.5078125, + "logits/rejected": 4.8046875, + "logps/chosen": -1163.5, + "logps/rejected": -716.5, + "loss": 1.0104, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.019977569580078125, + "rewards/margins": -0.044189453125, + "rewards/rejected": 0.02423858642578125, + "step": 5 + }, + { 
+ "epoch": 0.0011905352448038098, + "grad_norm": 212.08028178417675, + "learning_rate": 9.92063492063492e-09, + "logits/chosen": 4.5390625, + "logits/rejected": 4.8359375, + "logps/chosen": -971.5, + "logps/rejected": -1270.0, + "loss": 1.0128, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.089202880859375, + "rewards/margins": -0.0626220703125, + "rewards/rejected": -0.02655029296875, + "step": 6 + }, + { + "epoch": 0.0013889577856044447, + "grad_norm": 125.74368627090936, + "learning_rate": 1.1904761904761903e-08, + "logits/chosen": 4.42578125, + "logits/rejected": 4.453125, + "logps/chosen": -1079.0, + "logps/rejected": -738.0, + "loss": 0.9874, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.0211181640625, + "rewards/margins": 0.05548095703125, + "rewards/rejected": -0.03438568115234375, + "step": 7 + }, + { + "epoch": 0.0015873803264050796, + "grad_norm": 97.44144978850635, + "learning_rate": 1.3888888888888887e-08, + "logits/chosen": 4.609375, + "logits/rejected": 4.8828125, + "logps/chosen": -750.5, + "logps/rejected": -588.5, + "loss": 0.9838, + "rewards/accuracies": 0.40625, + "rewards/chosen": 0.104400634765625, + "rewards/margins": 0.07318115234375, + "rewards/rejected": 0.031341552734375, + "step": 8 + }, + { + "epoch": 0.0017858028672057145, + "grad_norm": 120.10455632068323, + "learning_rate": 1.5873015873015872e-08, + "logits/chosen": 4.5, + "logits/rejected": 4.6015625, + "logps/chosen": -1223.0, + "logps/rejected": -741.0, + "loss": 0.9929, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.011932373046875, + "rewards/margins": 0.0302734375, + "rewards/rejected": -0.0422515869140625, + "step": 9 + }, + { + "epoch": 0.0019842254080063497, + "grad_norm": 122.99840419092497, + "learning_rate": 1.7857142857142856e-08, + "logits/chosen": 4.4296875, + "logits/rejected": 4.9140625, + "logps/chosen": -1093.0, + "logps/rejected": -673.0, + "loss": 1.0063, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.03990936279296875, + 
"rewards/margins": -0.02745819091796875, + "rewards/rejected": -0.0125274658203125, + "step": 10 + }, + { + "epoch": 0.0021826479488069846, + "grad_norm": 148.85980209483702, + "learning_rate": 1.984126984126984e-08, + "logits/chosen": 4.27734375, + "logits/rejected": 4.453125, + "logps/chosen": -838.0, + "logps/rejected": -1425.5, + "loss": 0.9811, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.056304931640625, + "rewards/margins": 0.08530426025390625, + "rewards/rejected": -0.02898406982421875, + "step": 11 + }, + { + "epoch": 0.0023810704896076195, + "grad_norm": 136.1912235563449, + "learning_rate": 2.1825396825396823e-08, + "logits/chosen": 4.5234375, + "logits/rejected": 4.84375, + "logps/chosen": -1222.0, + "logps/rejected": -700.5, + "loss": 1.0233, + "rewards/accuracies": 0.15625, + "rewards/chosen": -0.143402099609375, + "rewards/margins": -0.121002197265625, + "rewards/rejected": -0.0226287841796875, + "step": 12 + }, + { + "epoch": 0.0025794930304082545, + "grad_norm": 116.93876579543387, + "learning_rate": 2.3809523809523807e-08, + "logits/chosen": 4.828125, + "logits/rejected": 4.90625, + "logps/chosen": -969.0, + "logps/rejected": -695.5, + "loss": 0.9747, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06732177734375, + "rewards/margins": 0.1032257080078125, + "rewards/rejected": -0.03594970703125, + "step": 13 + }, + { + "epoch": 0.0027779155712088894, + "grad_norm": 134.37650831629986, + "learning_rate": 2.579365079365079e-08, + "logits/chosen": 4.1484375, + "logits/rejected": 4.1171875, + "logps/chosen": -1165.0, + "logps/rejected": -1251.0, + "loss": 1.0156, + "rewards/accuracies": 0.15625, + "rewards/chosen": -0.06243896484375, + "rewards/margins": -0.066162109375, + "rewards/rejected": 0.00371551513671875, + "step": 14 + }, + { + "epoch": 0.0029763381120095243, + "grad_norm": 115.2887228181175, + "learning_rate": 2.7777777777777774e-08, + "logits/chosen": 4.5234375, + "logits/rejected": 4.6484375, + "logps/chosen": -787.0, + 
"logps/rejected": -800.0, + "loss": 0.9865, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.028589248657226562, + "rewards/margins": 0.05511474609375, + "rewards/rejected": -0.0265960693359375, + "step": 15 + }, + { + "epoch": 0.0031747606528101592, + "grad_norm": 98.18349143805, + "learning_rate": 2.9761904761904758e-08, + "logits/chosen": 4.75, + "logits/rejected": 5.2109375, + "logps/chosen": -908.0, + "logps/rejected": -533.0, + "loss": 0.9835, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.051586151123046875, + "rewards/margins": 0.06768798828125, + "rewards/rejected": -0.015977859497070312, + "step": 16 + }, + { + "epoch": 0.003373183193610794, + "grad_norm": 124.78247736524793, + "learning_rate": 3.1746031746031744e-08, + "logits/chosen": 4.8515625, + "logits/rejected": 4.765625, + "logps/chosen": -1053.0, + "logps/rejected": -649.0, + "loss": 1.0288, + "rewards/accuracies": 0.15625, + "rewards/chosen": -0.155029296875, + "rewards/margins": -0.1683349609375, + "rewards/rejected": 0.013275146484375, + "step": 17 + }, + { + "epoch": 0.003571605734411429, + "grad_norm": 122.86197754114865, + "learning_rate": 3.373015873015873e-08, + "logits/chosen": 4.1640625, + "logits/rejected": 4.234375, + "logps/chosen": -1171.0, + "logps/rejected": -1067.0, + "loss": 0.9989, + "rewards/accuracies": 0.21875, + "rewards/chosen": -0.0284881591796875, + "rewards/margins": 0.0041961669921875, + "rewards/rejected": -0.03281211853027344, + "step": 18 + }, + { + "epoch": 0.003770028275212064, + "grad_norm": 285.0368781513562, + "learning_rate": 3.571428571428571e-08, + "logits/chosen": 3.8671875, + "logits/rejected": 4.19140625, + "logps/chosen": -1113.0, + "logps/rejected": -691.0, + "loss": 1.0037, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.04290771484375, + "rewards/margins": -0.018665313720703125, + "rewards/rejected": -0.024200439453125, + "step": 19 + }, + { + "epoch": 0.003968450816012699, + "grad_norm": 106.95040450944398, + "learning_rate": 
3.7698412698412695e-08, + "logits/chosen": 4.6640625, + "logits/rejected": 4.8671875, + "logps/chosen": -1276.0, + "logps/rejected": -658.0, + "loss": 0.9752, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.141357421875, + "rewards/margins": 0.142364501953125, + "rewards/rejected": -0.001148223876953125, + "step": 20 + }, + { + "epoch": 0.004166873356813334, + "grad_norm": 130.53347997628782, + "learning_rate": 3.968253968253968e-08, + "logits/chosen": 4.5546875, + "logits/rejected": 4.5703125, + "logps/chosen": -1153.0, + "logps/rejected": -700.0, + "loss": 1.004, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.059478759765625, + "rewards/margins": -0.0219573974609375, + "rewards/rejected": -0.0375213623046875, + "step": 21 + }, + { + "epoch": 0.004365295897613969, + "grad_norm": 123.24506466810581, + "learning_rate": 4.166666666666666e-08, + "logits/chosen": 4.46875, + "logits/rejected": 4.7890625, + "logps/chosen": -1175.5, + "logps/rejected": -794.5, + "loss": 0.9944, + "rewards/accuracies": 0.21875, + "rewards/chosen": 0.006317138671875, + "rewards/margins": 0.02545166015625, + "rewards/rejected": -0.01910400390625, + "step": 22 + }, + { + "epoch": 0.004563718438414604, + "grad_norm": 98.83491615298671, + "learning_rate": 4.3650793650793646e-08, + "logits/chosen": 4.515625, + "logits/rejected": 4.37109375, + "logps/chosen": -766.5, + "logps/rejected": -600.5, + "loss": 1.0139, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.074432373046875, + "rewards/margins": -0.074615478515625, + "rewards/rejected": 0.000213623046875, + "step": 23 + }, + { + "epoch": 0.004762140979215239, + "grad_norm": 113.54047603699867, + "learning_rate": 4.563492063492063e-08, + "logits/chosen": 4.921875, + "logits/rejected": 4.8515625, + "logps/chosen": -1085.0, + "logps/rejected": -542.5, + "loss": 0.9985, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0376739501953125, + "rewards/margins": 0.0130615234375, + "rewards/rejected": 0.024871826171875, + "step": 24 + 
}, + { + "epoch": 0.0049605635200158735, + "grad_norm": 117.38987435435085, + "learning_rate": 4.7619047619047613e-08, + "logits/chosen": 3.99609375, + "logits/rejected": 4.0546875, + "logps/chosen": -1310.0, + "logps/rejected": -769.5, + "loss": 0.9916, + "rewards/accuracies": 0.21875, + "rewards/chosen": 0.05928802490234375, + "rewards/margins": 0.0334014892578125, + "rewards/rejected": 0.025390625, + "step": 25 + }, + { + "epoch": 0.005158986060816509, + "grad_norm": 92.42182384307407, + "learning_rate": 4.96031746031746e-08, + "logits/chosen": 4.02734375, + "logits/rejected": 4.16015625, + "logps/chosen": -1062.0, + "logps/rejected": -597.0, + "loss": 1.0173, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.042633056640625, + "rewards/margins": -0.07470703125, + "rewards/rejected": 0.0320892333984375, + "step": 26 + }, + { + "epoch": 0.005357408601617143, + "grad_norm": 111.88074296346883, + "learning_rate": 5.158730158730158e-08, + "logits/chosen": 4.6015625, + "logits/rejected": 4.6171875, + "logps/chosen": -1023.0, + "logps/rejected": -991.5, + "loss": 0.9633, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1529388427734375, + "rewards/margins": 0.19207763671875, + "rewards/rejected": -0.0392608642578125, + "step": 27 + }, + { + "epoch": 0.005555831142417779, + "grad_norm": 131.1428366705475, + "learning_rate": 5.3571428571428564e-08, + "logits/chosen": 4.5234375, + "logits/rejected": 4.78125, + "logps/chosen": -924.0, + "logps/rejected": -1381.5, + "loss": 1.0237, + "rewards/accuracies": 0.15625, + "rewards/chosen": -0.08043289184570312, + "rewards/margins": -0.10552978515625, + "rewards/rejected": 0.02520751953125, + "step": 28 + }, + { + "epoch": 0.005754253683218413, + "grad_norm": 89.5550142136238, + "learning_rate": 5.555555555555555e-08, + "logits/chosen": 4.890625, + "logits/rejected": 4.9375, + "logps/chosen": -843.0, + "logps/rejected": -488.0, + "loss": 0.9944, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.014007568359375, + 
"rewards/margins": 0.0294189453125, + "rewards/rejected": -0.015380859375, + "step": 29 + }, + { + "epoch": 0.005952676224019049, + "grad_norm": 586.9464479083147, + "learning_rate": 5.753968253968253e-08, + "logits/chosen": 4.0234375, + "logits/rejected": 4.3203125, + "logps/chosen": -1187.0, + "logps/rejected": -1170.0, + "loss": 0.9856, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.00466156005859375, + "rewards/margins": 0.06207275390625, + "rewards/rejected": -0.05743408203125, + "step": 30 + }, + { + "epoch": 0.006151098764819683, + "grad_norm": 108.68290566650933, + "learning_rate": 5.9523809523809515e-08, + "logits/chosen": 4.01953125, + "logits/rejected": 4.40234375, + "logps/chosen": -1023.0, + "logps/rejected": -747.0, + "loss": 0.993, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.016693115234375, + "rewards/margins": 0.02606201171875, + "rewards/rejected": -0.0095062255859375, + "step": 31 + }, + { + "epoch": 0.0063495213056203184, + "grad_norm": 129.1564168676532, + "learning_rate": 6.15079365079365e-08, + "logits/chosen": 4.109375, + "logits/rejected": 4.41796875, + "logps/chosen": -1111.0, + "logps/rejected": -1098.0, + "loss": 1.005, + "rewards/accuracies": 0.09375, + "rewards/chosen": 0.00543212890625, + "rewards/margins": -0.022705078125, + "rewards/rejected": 0.0281982421875, + "step": 32 + }, + { + "epoch": 0.006547943846420954, + "grad_norm": 87.45258526382078, + "learning_rate": 6.349206349206349e-08, + "logits/chosen": 4.7578125, + "logits/rejected": 4.84765625, + "logps/chosen": -894.0, + "logps/rejected": -528.0, + "loss": 0.99, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.0218963623046875, + "rewards/margins": 0.038238525390625, + "rewards/rejected": -0.0601043701171875, + "step": 33 + }, + { + "epoch": 0.006746366387221588, + "grad_norm": 91.71831681982506, + "learning_rate": 6.547619047619047e-08, + "logits/chosen": 4.671875, + "logits/rejected": 4.6875, + "logps/chosen": -849.0, + "logps/rejected": -490.5, + 
"loss": 1.0121, + "rewards/accuracies": 0.21875, + "rewards/chosen": -0.0634765625, + "rewards/margins": -0.06121826171875, + "rewards/rejected": -0.0019378662109375, + "step": 34 + }, + { + "epoch": 0.006944788928022224, + "grad_norm": 99.50660472822058, + "learning_rate": 6.746031746031746e-08, + "logits/chosen": 4.80078125, + "logits/rejected": 4.9453125, + "logps/chosen": -785.5, + "logps/rejected": -1025.5, + "loss": 1.0005, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.00543212890625, + "rewards/margins": -0.00555419921875, + "rewards/rejected": -2.288818359375e-05, + "step": 35 + }, + { + "epoch": 0.007143211468822858, + "grad_norm": 109.009457350833, + "learning_rate": 6.944444444444444e-08, + "logits/chosen": 4.3671875, + "logits/rejected": 4.25390625, + "logps/chosen": -1168.0, + "logps/rejected": -561.5, + "loss": 0.9974, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.03411865234375, + "rewards/margins": 0.00146484375, + "rewards/rejected": -0.0355987548828125, + "step": 36 + }, + { + "epoch": 0.0073416340096234935, + "grad_norm": 122.37673329346393, + "learning_rate": 7.142857142857142e-08, + "logits/chosen": 4.56640625, + "logits/rejected": 4.7578125, + "logps/chosen": -1031.0, + "logps/rejected": -770.0, + "loss": 0.9889, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.014883041381835938, + "rewards/margins": 0.044677734375, + "rewards/rejected": -0.02968597412109375, + "step": 37 + }, + { + "epoch": 0.007540056550424128, + "grad_norm": 102.56877222051489, + "learning_rate": 7.341269841269841e-08, + "logits/chosen": 4.5078125, + "logits/rejected": 4.625, + "logps/chosen": -969.0, + "logps/rejected": -640.5, + "loss": 0.9945, + "rewards/accuracies": 0.28125, + "rewards/chosen": 0.022541046142578125, + "rewards/margins": 0.02294921875, + "rewards/rejected": -0.00067138671875, + "step": 38 + }, + { + "epoch": 0.007738479091224763, + "grad_norm": 134.7172864425327, + "learning_rate": 7.539682539682539e-08, + "logits/chosen": 4.78125, 
+ "logits/rejected": 4.765625, + "logps/chosen": -1166.0, + "logps/rejected": -945.0, + "loss": 0.9751, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.018341064453125, + "rewards/margins": 0.1097412109375, + "rewards/rejected": -0.09149169921875, + "step": 39 + }, + { + "epoch": 0.007936901632025399, + "grad_norm": 90.78541110065206, + "learning_rate": 7.738095238095237e-08, + "logits/chosen": 4.23046875, + "logits/rejected": 4.4765625, + "logps/chosen": -925.0, + "logps/rejected": -821.0, + "loss": 0.9756, + "rewards/accuracies": 0.40625, + "rewards/chosen": 0.075225830078125, + "rewards/margins": 0.100250244140625, + "rewards/rejected": -0.024993896484375, + "step": 40 + }, + { + "epoch": 0.008135324172826032, + "grad_norm": 94.1213628684443, + "learning_rate": 7.936507936507936e-08, + "logits/chosen": 4.5703125, + "logits/rejected": 4.6875, + "logps/chosen": -842.0, + "logps/rejected": -523.0, + "loss": 0.999, + "rewards/accuracies": 0.21875, + "rewards/chosen": 0.0140533447265625, + "rewards/margins": 0.005481719970703125, + "rewards/rejected": 0.008617401123046875, + "step": 41 + }, + { + "epoch": 0.008333746713626668, + "grad_norm": 222.37897690831076, + "learning_rate": 8.134920634920634e-08, + "logits/chosen": 4.4140625, + "logits/rejected": 5.09375, + "logps/chosen": -1070.0, + "logps/rejected": -1287.0, + "loss": 0.9731, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.11236572265625, + "rewards/margins": 0.117431640625, + "rewards/rejected": -0.0047607421875, + "step": 42 + }, + { + "epoch": 0.008532169254427303, + "grad_norm": 132.12722005595515, + "learning_rate": 8.333333333333333e-08, + "logits/chosen": 4.4140625, + "logits/rejected": 4.8046875, + "logps/chosen": -1260.0, + "logps/rejected": -753.0, + "loss": 0.9871, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.001556396484375, + "rewards/margins": 0.050060272216796875, + "rewards/rejected": -0.0516357421875, + "step": 43 + }, + { + "epoch": 0.008730591795227938, + 
"grad_norm": 89.52985595207755, + "learning_rate": 8.531746031746032e-08, + "logits/chosen": 4.5, + "logits/rejected": 4.6171875, + "logps/chosen": -723.0, + "logps/rejected": -770.0, + "loss": 0.9867, + "rewards/accuracies": 0.28125, + "rewards/chosen": 0.0311431884765625, + "rewards/margins": 0.0533905029296875, + "rewards/rejected": -0.02227783203125, + "step": 44 + }, + { + "epoch": 0.008929014336028572, + "grad_norm": 130.34799014380025, + "learning_rate": 8.730158730158729e-08, + "logits/chosen": 4.21875, + "logits/rejected": 4.4140625, + "logps/chosen": -1576.0, + "logps/rejected": -849.0, + "loss": 0.9999, + "rewards/accuracies": 0.21875, + "rewards/chosen": 0.015625, + "rewards/margins": 0.004638671875, + "rewards/rejected": 0.0109405517578125, + "step": 45 + }, + { + "epoch": 0.009127436876829207, + "grad_norm": 130.4053925332321, + "learning_rate": 8.928571428571429e-08, + "logits/chosen": 4.78125, + "logits/rejected": 4.84375, + "logps/chosen": -1305.0, + "logps/rejected": -910.5, + "loss": 0.9489, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.0625457763671875, + "rewards/margins": 0.2315673828125, + "rewards/rejected": -0.169158935546875, + "step": 46 + }, + { + "epoch": 0.009325859417629843, + "grad_norm": 115.8870655657484, + "learning_rate": 9.126984126984126e-08, + "logits/chosen": 4.59375, + "logits/rejected": 4.625, + "logps/chosen": -1055.0, + "logps/rejected": -720.5, + "loss": 0.9939, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.02569580078125, + "rewards/margins": 0.01641845703125, + "rewards/rejected": -0.04205322265625, + "step": 47 + }, + { + "epoch": 0.009524281958430478, + "grad_norm": 108.2213680723501, + "learning_rate": 9.325396825396826e-08, + "logits/chosen": 4.6015625, + "logits/rejected": 4.671875, + "logps/chosen": -1121.0, + "logps/rejected": -820.0, + "loss": 1.0052, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.0108642578125, + "rewards/margins": -0.0228118896484375, + "rewards/rejected": 
0.0117645263671875, + "step": 48 + }, + { + "epoch": 0.009722704499231113, + "grad_norm": 110.02338887003987, + "learning_rate": 9.523809523809523e-08, + "logits/chosen": 4.4609375, + "logits/rejected": 4.578125, + "logps/chosen": -1182.0, + "logps/rejected": -683.0, + "loss": 0.9945, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.009033203125, + "rewards/margins": 0.0225830078125, + "rewards/rejected": -0.03173255920410156, + "step": 49 + }, + { + "epoch": 0.009921127040031747, + "grad_norm": 111.3769744729181, + "learning_rate": 9.722222222222222e-08, + "logits/chosen": 4.875, + "logits/rejected": 4.8046875, + "logps/chosen": -1290.0, + "logps/rejected": -813.0, + "loss": 0.9949, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.04150390625, + "rewards/margins": 0.0107269287109375, + "rewards/rejected": -0.052154541015625, + "step": 50 + }, + { + "epoch": 0.010119549580832382, + "grad_norm": 147.30496970512095, + "learning_rate": 9.92063492063492e-08, + "logits/chosen": 4.7578125, + "logits/rejected": 4.765625, + "logps/chosen": -2008.0, + "logps/rejected": -878.0, + "loss": 0.9659, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.00762939453125, + "rewards/margins": 0.1170654296875, + "rewards/rejected": -0.1094970703125, + "step": 51 + }, + { + "epoch": 0.010317972121633018, + "grad_norm": 92.16462493604149, + "learning_rate": 1.0119047619047619e-07, + "logits/chosen": 4.42578125, + "logits/rejected": 4.578125, + "logps/chosen": -913.0, + "logps/rejected": -656.0, + "loss": 0.9716, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.0125274658203125, + "rewards/margins": 0.115814208984375, + "rewards/rejected": -0.103240966796875, + "step": 52 + }, + { + "epoch": 0.010516394662433653, + "grad_norm": 122.33057118806808, + "learning_rate": 1.0317460317460316e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.70703125, + "logps/chosen": -916.0, + "logps/rejected": -542.0, + "loss": 0.9917, + "rewards/accuracies": 0.46875, + "rewards/chosen": 
-0.008056640625, + "rewards/margins": 0.0322418212890625, + "rewards/rejected": -0.04032135009765625, + "step": 53 + }, + { + "epoch": 0.010714817203234287, + "grad_norm": 98.88922157848386, + "learning_rate": 1.0515873015873016e-07, + "logits/chosen": 4.7109375, + "logits/rejected": 4.8203125, + "logps/chosen": -1015.0, + "logps/rejected": -656.0, + "loss": 0.9846, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0442352294921875, + "rewards/margins": 0.06689453125, + "rewards/rejected": -0.022674560546875, + "step": 54 + }, + { + "epoch": 0.010913239744034922, + "grad_norm": 103.44919619032089, + "learning_rate": 1.0714285714285713e-07, + "logits/chosen": 4.33203125, + "logits/rejected": 4.16796875, + "logps/chosen": -971.0, + "logps/rejected": -653.0, + "loss": 0.9813, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0029296875, + "rewards/margins": 0.081298828125, + "rewards/rejected": -0.0782470703125, + "step": 55 + }, + { + "epoch": 0.011111662284835557, + "grad_norm": 78.98130504160294, + "learning_rate": 1.0912698412698413e-07, + "logits/chosen": 4.625, + "logits/rejected": 4.859375, + "logps/chosen": -731.0, + "logps/rejected": -578.0, + "loss": 0.9863, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.022655487060546875, + "rewards/margins": 0.054107666015625, + "rewards/rejected": -0.0768585205078125, + "step": 56 + }, + { + "epoch": 0.011310084825636193, + "grad_norm": 100.74129083578332, + "learning_rate": 1.111111111111111e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.31640625, + "logps/chosen": -889.0, + "logps/rejected": -1107.0, + "loss": 0.9922, + "rewards/accuracies": 0.21875, + "rewards/chosen": -0.00156402587890625, + "rewards/margins": 0.03287506103515625, + "rewards/rejected": -0.034515380859375, + "step": 57 + }, + { + "epoch": 0.011508507366436826, + "grad_norm": 118.24086961012713, + "learning_rate": 1.1309523809523809e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.57421875, + "logps/chosen": -981.0, + 
"logps/rejected": -840.0, + "loss": 0.99, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.07513999938964844, + "rewards/margins": 0.034423828125, + "rewards/rejected": -0.109405517578125, + "step": 58 + }, + { + "epoch": 0.011706929907237462, + "grad_norm": 112.20060392269276, + "learning_rate": 1.1507936507936506e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.65625, + "logps/chosen": -909.5, + "logps/rejected": -640.0, + "loss": 0.9901, + "rewards/accuracies": 0.28125, + "rewards/chosen": 0.01088714599609375, + "rewards/margins": 0.046112060546875, + "rewards/rejected": -0.03525543212890625, + "step": 59 + }, + { + "epoch": 0.011905352448038097, + "grad_norm": 202.68976023452385, + "learning_rate": 1.1706349206349206e-07, + "logits/chosen": 4.5078125, + "logits/rejected": 4.94140625, + "logps/chosen": -954.5, + "logps/rejected": -791.0, + "loss": 1.0046, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.089111328125, + "rewards/margins": -0.02606201171875, + "rewards/rejected": -0.063262939453125, + "step": 60 + }, + { + "epoch": 0.012103774988838733, + "grad_norm": 115.45677054156461, + "learning_rate": 1.1904761904761903e-07, + "logits/chosen": 4.6875, + "logits/rejected": 4.62109375, + "logps/chosen": -1150.0, + "logps/rejected": -701.0, + "loss": 0.9529, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.017181396484375, + "rewards/margins": 0.201416015625, + "rewards/rejected": -0.18426513671875, + "step": 61 + }, + { + "epoch": 0.012302197529639366, + "grad_norm": 115.795400925016, + "learning_rate": 1.2103174603174603e-07, + "logits/chosen": 4.7265625, + "logits/rejected": 4.6484375, + "logps/chosen": -1156.0, + "logps/rejected": -699.0, + "loss": 0.9714, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.050811767578125, + "rewards/margins": 0.1163330078125, + "rewards/rejected": -0.16729736328125, + "step": 62 + }, + { + "epoch": 0.012500620070440002, + "grad_norm": 99.09352113578761, + "learning_rate": 1.23015873015873e-07, + 
"logits/chosen": 4.55859375, + "logits/rejected": 5.0390625, + "logps/chosen": -719.0, + "logps/rejected": -614.0, + "loss": 0.9888, + "rewards/accuracies": 0.21875, + "rewards/chosen": 0.005462646484375, + "rewards/margins": 0.046142578125, + "rewards/rejected": -0.0406494140625, + "step": 63 + }, + { + "epoch": 0.012699042611240637, + "grad_norm": 113.13866629503731, + "learning_rate": 1.25e-07, + "logits/chosen": 4.7265625, + "logits/rejected": 4.8125, + "logps/chosen": -1181.0, + "logps/rejected": -917.0, + "loss": 0.9832, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0007781982421875, + "rewards/margins": 0.06170654296875, + "rewards/rejected": -0.06097412109375, + "step": 64 + }, + { + "epoch": 0.012897465152041272, + "grad_norm": 108.03016723119599, + "learning_rate": 1.2698412698412698e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.8359375, + "logps/chosen": -972.0, + "logps/rejected": -1116.0, + "loss": 0.9578, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00681304931640625, + "rewards/margins": 0.178375244140625, + "rewards/rejected": -0.1715087890625, + "step": 65 + }, + { + "epoch": 0.013095887692841908, + "grad_norm": 122.05341727306755, + "learning_rate": 1.28968253968254e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 4.359375, + "logps/chosen": -1340.0, + "logps/rejected": -823.0, + "loss": 0.9561, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.024200439453125, + "rewards/margins": 0.181884765625, + "rewards/rejected": -0.1575927734375, + "step": 66 + }, + { + "epoch": 0.013294310233642541, + "grad_norm": 94.24464237071676, + "learning_rate": 1.3095238095238095e-07, + "logits/chosen": 4.69921875, + "logits/rejected": 4.8125, + "logps/chosen": -1070.0, + "logps/rejected": -624.0, + "loss": 0.9686, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.0062713623046875, + "rewards/margins": 0.119873046875, + "rewards/rejected": -0.11370849609375, + "step": 67 + }, + { + "epoch": 0.013492732774443177, + 
"grad_norm": 112.6325991203329, + "learning_rate": 1.3293650793650793e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.34765625, + "logps/chosen": -1201.0, + "logps/rejected": -658.5, + "loss": 0.9628, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04946327209472656, + "rewards/margins": 0.144134521484375, + "rewards/rejected": -0.193359375, + "step": 68 + }, + { + "epoch": 0.013691155315243812, + "grad_norm": 79.73464292216714, + "learning_rate": 1.349206349206349e-07, + "logits/chosen": 4.65625, + "logits/rejected": 4.796875, + "logps/chosen": -658.5, + "logps/rejected": -467.0, + "loss": 0.9819, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.03521728515625, + "rewards/margins": 0.07427978515625, + "rewards/rejected": -0.10955810546875, + "step": 69 + }, + { + "epoch": 0.013889577856044447, + "grad_norm": 111.11993220434202, + "learning_rate": 1.3690476190476192e-07, + "logits/chosen": 4.7265625, + "logits/rejected": 5.2109375, + "logps/chosen": -757.5, + "logps/rejected": -1194.0, + "loss": 0.993, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0528106689453125, + "rewards/margins": 0.02269744873046875, + "rewards/rejected": -0.075592041015625, + "step": 70 + }, + { + "epoch": 0.014088000396845081, + "grad_norm": 123.74240063189019, + "learning_rate": 1.3888888888888888e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.35546875, + "logps/chosen": -1412.0, + "logps/rejected": -1145.0, + "loss": 0.9515, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.097900390625, + "rewards/margins": 0.19635009765625, + "rewards/rejected": -0.2939453125, + "step": 71 + }, + { + "epoch": 0.014286422937645716, + "grad_norm": 189.65070311322168, + "learning_rate": 1.4087301587301586e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.765625, + "logps/chosen": -960.0, + "logps/rejected": -916.0, + "loss": 0.9565, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.010986328125, + "rewards/margins": 0.1790771484375, + 
"rewards/rejected": -0.1898193359375, + "step": 72 + }, + { + "epoch": 0.014484845478446352, + "grad_norm": 220.4988558175398, + "learning_rate": 1.4285714285714285e-07, + "logits/chosen": 4.51171875, + "logits/rejected": 4.53125, + "logps/chosen": -973.0, + "logps/rejected": -1126.0, + "loss": 0.9462, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.058624267578125, + "rewards/margins": 0.232177734375, + "rewards/rejected": -0.291015625, + "step": 73 + }, + { + "epoch": 0.014683268019246987, + "grad_norm": 87.22388607002826, + "learning_rate": 1.4484126984126986e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.359375, + "logps/chosen": -887.0, + "logps/rejected": -530.5, + "loss": 0.9731, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0665283203125, + "rewards/margins": 0.109375, + "rewards/rejected": -0.1759033203125, + "step": 74 + }, + { + "epoch": 0.01488169056004762, + "grad_norm": 109.8028535459321, + "learning_rate": 1.4682539682539681e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.3515625, + "logps/chosen": -1595.0, + "logps/rejected": -916.0, + "loss": 0.9431, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06219482421875, + "rewards/margins": 0.2423095703125, + "rewards/rejected": -0.304443359375, + "step": 75 + }, + { + "epoch": 0.015080113100848256, + "grad_norm": 82.10789158331998, + "learning_rate": 1.488095238095238e-07, + "logits/chosen": 4.6875, + "logits/rejected": 4.9296875, + "logps/chosen": -971.5, + "logps/rejected": -757.5, + "loss": 0.955, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.16546630859375, + "rewards/margins": 0.201171875, + "rewards/rejected": -0.3671875, + "step": 76 + }, + { + "epoch": 0.015278535641648891, + "grad_norm": 117.3444858743771, + "learning_rate": 1.5079365079365078e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 4.4453125, + "logps/chosen": -887.0, + "logps/rejected": -727.0, + "loss": 0.954, + "rewards/accuracies": 0.5625, + "rewards/chosen": 
-0.0660247802734375, + "rewards/margins": 0.190185546875, + "rewards/rejected": -0.256103515625, + "step": 77 + }, + { + "epoch": 0.015476958182449527, + "grad_norm": 95.18076187208315, + "learning_rate": 1.527777777777778e-07, + "logits/chosen": 4.28515625, + "logits/rejected": 4.359375, + "logps/chosen": -1196.0, + "logps/rejected": -654.5, + "loss": 0.9251, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.017675399780273438, + "rewards/margins": 0.3682861328125, + "rewards/rejected": -0.3505859375, + "step": 78 + }, + { + "epoch": 0.01567538072325016, + "grad_norm": 101.89756146705878, + "learning_rate": 1.5476190476190475e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.5234375, + "logps/chosen": -1125.0, + "logps/rejected": -663.0, + "loss": 0.9528, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.121185302734375, + "rewards/margins": 0.19952392578125, + "rewards/rejected": -0.320556640625, + "step": 79 + }, + { + "epoch": 0.015873803264050797, + "grad_norm": 107.23607253225778, + "learning_rate": 1.5674603174603173e-07, + "logits/chosen": 4.48828125, + "logits/rejected": 4.76171875, + "logps/chosen": -1304.5, + "logps/rejected": -789.5, + "loss": 0.9209, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.024444580078125, + "rewards/margins": 0.39068603515625, + "rewards/rejected": -0.4149169921875, + "step": 80 + }, + { + "epoch": 0.01607222580485143, + "grad_norm": 98.17608597714033, + "learning_rate": 1.5873015873015872e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 4.0078125, + "logps/chosen": -1156.0, + "logps/rejected": -688.0, + "loss": 0.9376, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1099395751953125, + "rewards/margins": 0.2630615234375, + "rewards/rejected": -0.37255859375, + "step": 81 + }, + { + "epoch": 0.016270648345652065, + "grad_norm": 103.08913815308641, + "learning_rate": 1.6071428571428573e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.4453125, + "logps/chosen": -966.0, + 
"logps/rejected": -1050.5, + "loss": 0.9462, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.115966796875, + "rewards/margins": 0.2587890625, + "rewards/rejected": -0.374755859375, + "step": 82 + }, + { + "epoch": 0.016469070886452702, + "grad_norm": 84.10948234921794, + "learning_rate": 1.6269841269841268e-07, + "logits/chosen": 4.6640625, + "logits/rejected": 4.5703125, + "logps/chosen": -847.0, + "logps/rejected": -464.5, + "loss": 0.9526, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09539794921875, + "rewards/margins": 0.19921875, + "rewards/rejected": -0.294189453125, + "step": 83 + }, + { + "epoch": 0.016667493427253335, + "grad_norm": 82.5804023326743, + "learning_rate": 1.6468253968253967e-07, + "logits/chosen": 4.828125, + "logits/rejected": 4.7421875, + "logps/chosen": -984.0, + "logps/rejected": -654.0, + "loss": 0.9202, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.048583984375, + "rewards/margins": 0.411865234375, + "rewards/rejected": -0.46044921875, + "step": 84 + }, + { + "epoch": 0.016865915968053972, + "grad_norm": 96.76457898130536, + "learning_rate": 1.6666666666666665e-07, + "logits/chosen": 4.61328125, + "logits/rejected": 4.84375, + "logps/chosen": -926.0, + "logps/rejected": -806.5, + "loss": 0.943, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12939453125, + "rewards/margins": 0.243408203125, + "rewards/rejected": -0.373046875, + "step": 85 + }, + { + "epoch": 0.017064338508854606, + "grad_norm": 88.07999921526088, + "learning_rate": 1.6865079365079366e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 4.796875, + "logps/chosen": -1140.0, + "logps/rejected": -639.5, + "loss": 0.8893, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.021881103515625, + "rewards/margins": 0.5164794921875, + "rewards/rejected": -0.495361328125, + "step": 86 + }, + { + "epoch": 0.01726276104965524, + "grad_norm": 86.29839932342362, + "learning_rate": 1.7063492063492064e-07, + "logits/chosen": 4.46875, + 
"logits/rejected": 4.8125, + "logps/chosen": -825.0, + "logps/rejected": -537.5, + "loss": 0.9019, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0699920654296875, + "rewards/margins": 0.49609375, + "rewards/rejected": -0.56591796875, + "step": 87 + }, + { + "epoch": 0.017461183590455877, + "grad_norm": 78.4275593759685, + "learning_rate": 1.726190476190476e-07, + "logits/chosen": 4.65625, + "logits/rejected": 4.6640625, + "logps/chosen": -776.0, + "logps/rejected": -470.5, + "loss": 0.9636, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0902099609375, + "rewards/margins": 0.14935302734375, + "rewards/rejected": -0.2393798828125, + "step": 88 + }, + { + "epoch": 0.01765960613125651, + "grad_norm": 91.22223666771734, + "learning_rate": 1.7460317460317458e-07, + "logits/chosen": 4.53515625, + "logits/rejected": 4.66796875, + "logps/chosen": -1042.0, + "logps/rejected": -604.0, + "loss": 0.9286, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0677490234375, + "rewards/margins": 0.311279296875, + "rewards/rejected": -0.37841796875, + "step": 89 + }, + { + "epoch": 0.017858028672057144, + "grad_norm": 75.49955143353483, + "learning_rate": 1.765873015873016e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.5234375, + "logps/chosen": -938.0, + "logps/rejected": -582.5, + "loss": 0.9469, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.11147308349609375, + "rewards/margins": 0.24310302734375, + "rewards/rejected": -0.35528564453125, + "step": 90 + }, + { + "epoch": 0.01805645121285778, + "grad_norm": 80.71897420636664, + "learning_rate": 1.7857142857142858e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.5234375, + "logps/chosen": -912.0, + "logps/rejected": -746.0, + "loss": 0.9012, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12445068359375, + "rewards/margins": 0.46044921875, + "rewards/rejected": -0.33636474609375, + "step": 91 + }, + { + "epoch": 0.018254873753658415, + "grad_norm": 141.20747878106073, + 
"learning_rate": 1.8055555555555554e-07, + "logits/chosen": 4.015625, + "logits/rejected": 4.2734375, + "logps/chosen": -1009.0, + "logps/rejected": -1356.0, + "loss": 0.8951, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.0145721435546875, + "rewards/margins": 0.472076416015625, + "rewards/rejected": -0.4580078125, + "step": 92 + }, + { + "epoch": 0.018453296294459052, + "grad_norm": 99.8240417320946, + "learning_rate": 1.8253968253968252e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.4609375, + "logps/chosen": -1143.0, + "logps/rejected": -705.5, + "loss": 0.8501, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0312347412109375, + "rewards/margins": 0.71875, + "rewards/rejected": -0.74755859375, + "step": 93 + }, + { + "epoch": 0.018651718835259685, + "grad_norm": 96.54889430130858, + "learning_rate": 1.8452380952380953e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.421875, + "logps/chosen": -1353.0, + "logps/rejected": -776.0, + "loss": 0.8545, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.02001953125, + "rewards/margins": 0.685546875, + "rewards/rejected": -0.70458984375, + "step": 94 + }, + { + "epoch": 0.01885014137606032, + "grad_norm": 77.99562230669599, + "learning_rate": 1.865079365079365e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.734375, + "logps/chosen": -918.0, + "logps/rejected": -654.0, + "loss": 0.8751, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.045257568359375, + "rewards/margins": 0.730224609375, + "rewards/rejected": -0.773681640625, + "step": 95 + }, + { + "epoch": 0.019048563916860956, + "grad_norm": 83.74798414703336, + "learning_rate": 1.8849206349206347e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.71484375, + "logps/chosen": -1067.0, + "logps/rejected": -702.0, + "loss": 0.8761, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.089630126953125, + "rewards/margins": 0.6259765625, + "rewards/rejected": -0.71484375, + "step": 96 + }, + { + "epoch": 
0.01924698645766159, + "grad_norm": 92.23007488742385, + "learning_rate": 1.9047619047619045e-07, + "logits/chosen": 4.6796875, + "logits/rejected": 4.6953125, + "logps/chosen": -806.5, + "logps/rejected": -1207.0, + "loss": 0.9276, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.1048583984375, + "rewards/margins": 0.338592529296875, + "rewards/rejected": -0.443603515625, + "step": 97 + }, + { + "epoch": 0.019445408998462227, + "grad_norm": 93.1366862352412, + "learning_rate": 1.9246031746031746e-07, + "logits/chosen": 4.29296875, + "logits/rejected": 4.5234375, + "logps/chosen": -1230.0, + "logps/rejected": -925.0, + "loss": 0.8717, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.03900146484375, + "rewards/margins": 0.6552734375, + "rewards/rejected": -0.6943359375, + "step": 98 + }, + { + "epoch": 0.01964383153926286, + "grad_norm": 87.40805671601977, + "learning_rate": 1.9444444444444445e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.21875, + "logps/chosen": -1552.0, + "logps/rejected": -916.0, + "loss": 0.8375, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.067626953125, + "rewards/margins": 0.86328125, + "rewards/rejected": -0.931640625, + "step": 99 + }, + { + "epoch": 0.019842254080063494, + "grad_norm": 88.60151689088583, + "learning_rate": 1.964285714285714e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.375, + "logps/chosen": -1165.0, + "logps/rejected": -847.0, + "loss": 0.8452, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08956336975097656, + "rewards/margins": 0.9052734375, + "rewards/rejected": -0.99560546875, + "step": 100 + }, + { + "epoch": 0.02004067662086413, + "grad_norm": 74.63720500383481, + "learning_rate": 1.984126984126984e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.453125, + "logps/chosen": -996.0, + "logps/rejected": -805.5, + "loss": 0.8742, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0615234375, + "rewards/margins": 0.693359375, + "rewards/rejected": 
-0.7548828125, + "step": 101 + }, + { + "epoch": 0.020239099161664765, + "grad_norm": 88.59017268017108, + "learning_rate": 2.003968253968254e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.4609375, + "logps/chosen": -1084.0, + "logps/rejected": -582.0, + "loss": 0.8571, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.0626220703125, + "rewards/margins": 0.72998046875, + "rewards/rejected": -0.7919921875, + "step": 102 + }, + { + "epoch": 0.0204375217024654, + "grad_norm": 83.36142224618438, + "learning_rate": 2.0238095238095238e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.28515625, + "logps/chosen": -1042.0, + "logps/rejected": -909.0, + "loss": 0.854, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14422607421875, + "rewards/margins": 0.80224609375, + "rewards/rejected": -0.9443359375, + "step": 103 + }, + { + "epoch": 0.020635944243266036, + "grad_norm": 81.27113496236007, + "learning_rate": 2.0436507936507934e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.9921875, + "logps/chosen": -1164.0, + "logps/rejected": -660.0, + "loss": 0.8529, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0493621826171875, + "rewards/margins": 0.833984375, + "rewards/rejected": -0.8828125, + "step": 104 + }, + { + "epoch": 0.02083436678406667, + "grad_norm": 144.67656997326725, + "learning_rate": 2.0634920634920632e-07, + "logits/chosen": 4.55078125, + "logits/rejected": 4.6484375, + "logps/chosen": -1687.0, + "logps/rejected": -472.5, + "loss": 0.8932, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1790771484375, + "rewards/margins": 0.58447265625, + "rewards/rejected": -0.7626953125, + "step": 105 + }, + { + "epoch": 0.021032789324867306, + "grad_norm": 145.30207922564662, + "learning_rate": 2.0833333333333333e-07, + "logits/chosen": 4.53125, + "logits/rejected": 4.9140625, + "logps/chosen": -936.0, + "logps/rejected": -1490.5, + "loss": 0.9088, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.198974609375, + 
"rewards/margins": 0.5771484375, + "rewards/rejected": -0.7744140625, + "step": 106 + }, + { + "epoch": 0.02123121186566794, + "grad_norm": 103.53031552148362, + "learning_rate": 2.1031746031746032e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.5546875, + "logps/chosen": -1280.0, + "logps/rejected": -1373.0, + "loss": 0.8169, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11447906494140625, + "rewards/margins": 1.1552734375, + "rewards/rejected": -1.26953125, + "step": 107 + }, + { + "epoch": 0.021429634406468574, + "grad_norm": 73.67798745831767, + "learning_rate": 2.123015873015873e-07, + "logits/chosen": 4.64453125, + "logits/rejected": 4.9296875, + "logps/chosen": -1049.0, + "logps/rejected": -829.0, + "loss": 0.8575, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.176239013671875, + "rewards/margins": 0.8623046875, + "rewards/rejected": -1.0390625, + "step": 108 + }, + { + "epoch": 0.02162805694726921, + "grad_norm": 73.3606696549211, + "learning_rate": 2.1428571428571426e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.5, + "logps/chosen": -958.0, + "logps/rejected": -604.0, + "loss": 0.8235, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.03204345703125, + "rewards/margins": 0.9873046875, + "rewards/rejected": -1.0205078125, + "step": 109 + }, + { + "epoch": 0.021826479488069844, + "grad_norm": 68.5292092461208, + "learning_rate": 2.1626984126984127e-07, + "logits/chosen": 4.578125, + "logits/rejected": 4.71875, + "logps/chosen": -899.0, + "logps/rejected": -635.0, + "loss": 0.8982, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.2139892578125, + "rewards/margins": 0.73291015625, + "rewards/rejected": -0.9482421875, + "step": 110 + }, + { + "epoch": 0.02202490202887048, + "grad_norm": 116.61603300120338, + "learning_rate": 2.1825396825396825e-07, + "logits/chosen": 4.8125, + "logits/rejected": 5.03125, + "logps/chosen": -1002.0, + "logps/rejected": -1673.0, + "loss": 0.8704, + "rewards/accuracies": 0.8125, + 
"rewards/chosen": -0.110107421875, + "rewards/margins": 0.7890625, + "rewards/rejected": -0.8994140625, + "step": 111 + }, + { + "epoch": 0.022223324569671115, + "grad_norm": 114.45405780531215, + "learning_rate": 2.2023809523809523e-07, + "logits/chosen": 4.703125, + "logits/rejected": 4.66015625, + "logps/chosen": -1099.0, + "logps/rejected": -921.0, + "loss": 0.8362, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.1785125732421875, + "rewards/margins": 1.2880859375, + "rewards/rejected": -1.4658203125, + "step": 112 + }, + { + "epoch": 0.02242174711047175, + "grad_norm": 69.13233121249701, + "learning_rate": 2.222222222222222e-07, + "logits/chosen": 4.55078125, + "logits/rejected": 4.671875, + "logps/chosen": -1235.0, + "logps/rejected": -693.0, + "loss": 0.783, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.072509765625, + "rewards/margins": 1.365966796875, + "rewards/rejected": -1.2958984375, + "step": 113 + }, + { + "epoch": 0.022620169651272386, + "grad_norm": 78.64007184945415, + "learning_rate": 2.242063492063492e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.3203125, + "logps/chosen": -1258.0, + "logps/rejected": -802.0, + "loss": 0.8578, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.3125, + "rewards/margins": 1.2841796875, + "rewards/rejected": -1.5966796875, + "step": 114 + }, + { + "epoch": 0.02281859219207302, + "grad_norm": 62.814295251062504, + "learning_rate": 2.2619047619047619e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.453125, + "logps/chosen": -727.5, + "logps/rejected": -521.5, + "loss": 0.8657, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1873779296875, + "rewards/margins": 0.81982421875, + "rewards/rejected": -1.00537109375, + "step": 115 + }, + { + "epoch": 0.023017014732873653, + "grad_norm": 108.50010653960325, + "learning_rate": 2.2817460317460317e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.8046875, + "logps/chosen": -1351.0, + "logps/rejected": -713.5, + "loss": 
0.902, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.21337890625, + "rewards/margins": 0.6748046875, + "rewards/rejected": -0.88818359375, + "step": 116 + }, + { + "epoch": 0.02321543727367429, + "grad_norm": 73.03583260842197, + "learning_rate": 2.3015873015873013e-07, + "logits/chosen": 4.53125, + "logits/rejected": 4.89453125, + "logps/chosen": -850.0, + "logps/rejected": -661.0, + "loss": 0.8776, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.285888671875, + "rewards/margins": 1.076171875, + "rewards/rejected": -1.361328125, + "step": 117 + }, + { + "epoch": 0.023413859814474924, + "grad_norm": 88.72276785869919, + "learning_rate": 2.3214285714285714e-07, + "logits/chosen": 4.7734375, + "logits/rejected": 4.69921875, + "logps/chosen": -1081.0, + "logps/rejected": -870.5, + "loss": 0.801, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.009735107421875, + "rewards/margins": 1.25048828125, + "rewards/rejected": -1.2607421875, + "step": 118 + }, + { + "epoch": 0.02361228235527556, + "grad_norm": 71.81421963658579, + "learning_rate": 2.3412698412698412e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.3828125, + "logps/chosen": -984.0, + "logps/rejected": -573.5, + "loss": 0.8805, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.188720703125, + "rewards/margins": 0.95947265625, + "rewards/rejected": -1.146484375, + "step": 119 + }, + { + "epoch": 0.023810704896076194, + "grad_norm": 72.1124379790669, + "learning_rate": 2.361111111111111e-07, + "logits/chosen": 4.625, + "logits/rejected": 4.58984375, + "logps/chosen": -1055.0, + "logps/rejected": -653.0, + "loss": 0.8455, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.17000579833984375, + "rewards/margins": 1.1259765625, + "rewards/rejected": -1.294921875, + "step": 120 + }, + { + "epoch": 0.024009127436876828, + "grad_norm": 60.950502436767344, + "learning_rate": 2.3809523809523806e-07, + "logits/chosen": 3.921875, + "logits/rejected": 3.7578125, + "logps/chosen": 
-969.0, + "logps/rejected": -527.75, + "loss": 0.8372, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.182861328125, + "rewards/margins": 1.00244140625, + "rewards/rejected": -1.1826171875, + "step": 121 + }, + { + "epoch": 0.024207549977677465, + "grad_norm": 66.64373425859932, + "learning_rate": 2.4007936507936507e-07, + "logits/chosen": 4.6640625, + "logits/rejected": 5.0703125, + "logps/chosen": -720.0, + "logps/rejected": -819.0, + "loss": 0.8663, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.162689208984375, + "rewards/margins": 0.82421875, + "rewards/rejected": -0.9873046875, + "step": 122 + }, + { + "epoch": 0.0244059725184781, + "grad_norm": 90.08783464445922, + "learning_rate": 2.4206349206349205e-07, + "logits/chosen": 4.015625, + "logits/rejected": 4.5703125, + "logps/chosen": -957.0, + "logps/rejected": -1415.5, + "loss": 0.8328, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1410369873046875, + "rewards/margins": 1.4228515625, + "rewards/rejected": -1.5634765625, + "step": 123 + }, + { + "epoch": 0.024604395059278732, + "grad_norm": 90.30587961911043, + "learning_rate": 2.4404761904761904e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.4609375, + "logps/chosen": -1222.0, + "logps/rejected": -871.0, + "loss": 0.7966, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.15399169921875, + "rewards/margins": 1.7666015625, + "rewards/rejected": -1.921875, + "step": 124 + }, + { + "epoch": 0.02480281760007937, + "grad_norm": 79.97754188781265, + "learning_rate": 2.46031746031746e-07, + "logits/chosen": 4.578125, + "logits/rejected": 4.58984375, + "logps/chosen": -1155.0, + "logps/rejected": -791.5, + "loss": 0.8071, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.25537109375, + "rewards/margins": 1.4365234375, + "rewards/rejected": -1.6943359375, + "step": 125 + }, + { + "epoch": 0.025001240140880003, + "grad_norm": 75.658675560306, + "learning_rate": 2.48015873015873e-07, + "logits/chosen": 4.2890625, + 
"logits/rejected": 4.453125, + "logps/chosen": -1028.5, + "logps/rejected": -1085.5, + "loss": 0.8696, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.2520751953125, + "rewards/margins": 1.21875, + "rewards/rejected": -1.4677734375, + "step": 126 + }, + { + "epoch": 0.02519966268168064, + "grad_norm": 74.89868707354677, + "learning_rate": 2.5e-07, + "logits/chosen": 4.74609375, + "logits/rejected": 4.6328125, + "logps/chosen": -943.0, + "logps/rejected": -631.5, + "loss": 0.8395, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.068115234375, + "rewards/margins": 0.9697265625, + "rewards/rejected": -1.03515625, + "step": 127 + }, + { + "epoch": 0.025398085222481274, + "grad_norm": 73.3349093009553, + "learning_rate": 2.5198412698412697e-07, + "logits/chosen": 4.7421875, + "logits/rejected": 4.71875, + "logps/chosen": -1070.0, + "logps/rejected": -656.0, + "loss": 0.8315, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0703125, + "rewards/margins": 1.11279296875, + "rewards/rejected": -1.1845703125, + "step": 128 + }, + { + "epoch": 0.025596507763281907, + "grad_norm": 72.26288882232456, + "learning_rate": 2.5396825396825396e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.5, + "logps/chosen": -1135.0, + "logps/rejected": -1637.5, + "loss": 0.821, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.143310546875, + "rewards/margins": 1.537109375, + "rewards/rejected": -1.681640625, + "step": 129 + }, + { + "epoch": 0.025794930304082545, + "grad_norm": 93.21094330865085, + "learning_rate": 2.5595238095238094e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.47265625, + "logps/chosen": -1430.0, + "logps/rejected": -795.5, + "loss": 0.8119, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.162109375, + "rewards/margins": 1.0302734375, + "rewards/rejected": -1.189453125, + "step": 130 + }, + { + "epoch": 0.025993352844883178, + "grad_norm": 76.78124041174908, + "learning_rate": 2.57936507936508e-07, + "logits/chosen": 4.375, + 
"logits/rejected": 4.3828125, + "logps/chosen": -1213.0, + "logps/rejected": -1046.0, + "loss": 0.8416, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.27490234375, + "rewards/margins": 1.26171875, + "rewards/rejected": -1.53759765625, + "step": 131 + }, + { + "epoch": 0.026191775385683815, + "grad_norm": 74.06667506628604, + "learning_rate": 2.599206349206349e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.7265625, + "logps/chosen": -755.0, + "logps/rejected": -791.0, + "loss": 0.8673, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.095703125, + "rewards/margins": 0.763671875, + "rewards/rejected": -0.8583984375, + "step": 132 + }, + { + "epoch": 0.02639019792648445, + "grad_norm": 80.95016700268715, + "learning_rate": 2.619047619047619e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 4.6640625, + "logps/chosen": -1274.0, + "logps/rejected": -871.0, + "loss": 0.8007, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.228485107421875, + "rewards/margins": 1.30078125, + "rewards/rejected": -1.529296875, + "step": 133 + }, + { + "epoch": 0.026588620467285082, + "grad_norm": 80.97285228732666, + "learning_rate": 2.638888888888889e-07, + "logits/chosen": 4.5234375, + "logits/rejected": 4.859375, + "logps/chosen": -1160.0, + "logps/rejected": -863.0, + "loss": 0.8267, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.148193359375, + "rewards/margins": 1.3564453125, + "rewards/rejected": -1.505859375, + "step": 134 + }, + { + "epoch": 0.02678704300808572, + "grad_norm": 73.07145118803645, + "learning_rate": 2.6587301587301586e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.7109375, + "logps/chosen": -936.0, + "logps/rejected": -671.0, + "loss": 0.7715, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.0517425537109375, + "rewards/margins": 1.302734375, + "rewards/rejected": -1.2509765625, + "step": 135 + }, + { + "epoch": 0.026985465548886353, + "grad_norm": 73.08970356518728, + "learning_rate": 2.6785714285714284e-07, + 
"logits/chosen": 4.375, + "logits/rejected": 4.7265625, + "logps/chosen": -1309.0, + "logps/rejected": -826.0, + "loss": 0.7756, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.15130615234375, + "rewards/margins": 1.5107421875, + "rewards/rejected": -1.6611328125, + "step": 136 + }, + { + "epoch": 0.027183888089686987, + "grad_norm": 67.8794712264055, + "learning_rate": 2.698412698412698e-07, + "logits/chosen": 4.62890625, + "logits/rejected": 4.625, + "logps/chosen": -963.0, + "logps/rejected": -550.0, + "loss": 0.8506, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.02978515625, + "rewards/margins": 1.05859375, + "rewards/rejected": -1.0888671875, + "step": 137 + }, + { + "epoch": 0.027382310630487624, + "grad_norm": 89.5397878886822, + "learning_rate": 2.718253968253968e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.25, + "logps/chosen": -1201.0, + "logps/rejected": -536.5, + "loss": 0.8558, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.1190185546875, + "rewards/margins": 0.869140625, + "rewards/rejected": -0.98779296875, + "step": 138 + }, + { + "epoch": 0.027580733171288258, + "grad_norm": 61.107683315328146, + "learning_rate": 2.7380952380952385e-07, + "logits/chosen": 4.51953125, + "logits/rejected": 4.46875, + "logps/chosen": -651.5, + "logps/rejected": -625.5, + "loss": 0.8488, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.03749847412109375, + "rewards/margins": 1.0380859375, + "rewards/rejected": -1.07421875, + "step": 139 + }, + { + "epoch": 0.027779155712088895, + "grad_norm": 67.02695681997847, + "learning_rate": 2.757936507936508e-07, + "logits/chosen": 4.44921875, + "logits/rejected": 4.546875, + "logps/chosen": -987.0, + "logps/rejected": -524.5, + "loss": 0.8408, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26898193359375, + "rewards/margins": 0.9951171875, + "rewards/rejected": -1.2626953125, + "step": 140 + }, + { + "epoch": 0.027977578252889528, + "grad_norm": 78.12674986291275, + 
"learning_rate": 2.7777777777777776e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.3984375, + "logps/chosen": -1105.5, + "logps/rejected": -759.5, + "loss": 0.7961, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.0194091796875, + "rewards/margins": 1.2998046875, + "rewards/rejected": -1.3173828125, + "step": 141 + }, + { + "epoch": 0.028176000793690162, + "grad_norm": 71.94704099654177, + "learning_rate": 2.7976190476190474e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 4.38671875, + "logps/chosen": -1034.0, + "logps/rejected": -756.0, + "loss": 0.7841, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.156158447265625, + "rewards/margins": 1.91796875, + "rewards/rejected": -1.76171875, + "step": 142 + }, + { + "epoch": 0.0283744233344908, + "grad_norm": 75.96112225549004, + "learning_rate": 2.8174603174603173e-07, + "logits/chosen": 4.75, + "logits/rejected": 4.8125, + "logps/chosen": -1050.0, + "logps/rejected": -716.0, + "loss": 0.8197, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.0299072265625, + "rewards/margins": 1.130859375, + "rewards/rejected": -1.1630859375, + "step": 143 + }, + { + "epoch": 0.028572845875291433, + "grad_norm": 70.21818651147845, + "learning_rate": 2.837301587301587e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.546875, + "logps/chosen": -1118.0, + "logps/rejected": -1293.0, + "loss": 0.7755, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.1007080078125, + "rewards/margins": 1.72265625, + "rewards/rejected": -1.623046875, + "step": 144 + }, + { + "epoch": 0.02877126841609207, + "grad_norm": 80.31714266229748, + "learning_rate": 2.857142857142857e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.40625, + "logps/chosen": -1191.0, + "logps/rejected": -707.5, + "loss": 0.8378, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.35986328125, + "rewards/margins": 1.416015625, + "rewards/rejected": -1.775390625, + "step": 145 + }, + { + "epoch": 0.028969690956892703, + 
"grad_norm": 85.77488097689277, + "learning_rate": 2.876984126984127e-07, + "logits/chosen": 4.625, + "logits/rejected": 4.59375, + "logps/chosen": -1146.0, + "logps/rejected": -778.0, + "loss": 0.8145, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.0170745849609375, + "rewards/margins": 1.6220703125, + "rewards/rejected": -1.640625, + "step": 146 + }, + { + "epoch": 0.029168113497693337, + "grad_norm": 71.01504965875304, + "learning_rate": 2.896825396825397e-07, + "logits/chosen": 4.38671875, + "logits/rejected": 4.828125, + "logps/chosen": -851.0, + "logps/rejected": -617.5, + "loss": 0.8187, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1025390625, + "rewards/margins": 1.0146484375, + "rewards/rejected": -0.9111328125, + "step": 147 + }, + { + "epoch": 0.029366536038493974, + "grad_norm": 66.5614321752951, + "learning_rate": 2.916666666666667e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.51171875, + "logps/chosen": -712.0, + "logps/rejected": -648.0, + "loss": 0.8762, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.141845703125, + "rewards/margins": 0.6728515625, + "rewards/rejected": -0.8125, + "step": 148 + }, + { + "epoch": 0.029564958579294608, + "grad_norm": 82.34640474112035, + "learning_rate": 2.9365079365079363e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.69921875, + "logps/chosen": -1090.0, + "logps/rejected": -1724.0, + "loss": 0.7563, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.0201416015625, + "rewards/margins": 1.88671875, + "rewards/rejected": -1.865234375, + "step": 149 + }, + { + "epoch": 0.02976338112009524, + "grad_norm": 56.08645716266908, + "learning_rate": 2.956349206349206e-07, + "logits/chosen": 4.8515625, + "logits/rejected": 4.796875, + "logps/chosen": -994.0, + "logps/rejected": -1354.0, + "loss": 0.8063, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.136474609375, + "rewards/margins": 1.52587890625, + "rewards/rejected": -1.662353515625, + "step": 150 + }, + { + "epoch": 
0.02996180366089588, + "grad_norm": 70.20464404612031, + "learning_rate": 2.976190476190476e-07, + "logits/chosen": 4.6328125, + "logits/rejected": 4.9140625, + "logps/chosen": -1040.0, + "logps/rejected": -1672.0, + "loss": 0.8259, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.01324462890625, + "rewards/margins": 1.3935546875, + "rewards/rejected": -1.40625, + "step": 151 + }, + { + "epoch": 0.030160226201696512, + "grad_norm": 85.80513486109449, + "learning_rate": 2.996031746031746e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.6171875, + "logps/chosen": -1464.0, + "logps/rejected": -808.0, + "loss": 0.7574, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02325439453125, + "rewards/margins": 1.701171875, + "rewards/rejected": -1.67578125, + "step": 152 + }, + { + "epoch": 0.03035864874249715, + "grad_norm": 91.30905588493547, + "learning_rate": 3.0158730158730156e-07, + "logits/chosen": 4.59375, + "logits/rejected": 4.6875, + "logps/chosen": -868.0, + "logps/rejected": -670.0, + "loss": 0.8153, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.27978515625, + "rewards/margins": 1.1533203125, + "rewards/rejected": -0.87451171875, + "step": 153 + }, + { + "epoch": 0.030557071283297783, + "grad_norm": 67.82622930810354, + "learning_rate": 3.0357142857142855e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.2578125, + "logps/chosen": -950.0, + "logps/rejected": -654.5, + "loss": 0.7897, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.03570556640625, + "rewards/margins": 1.47021484375, + "rewards/rejected": -1.4326171875, + "step": 154 + }, + { + "epoch": 0.030755493824098416, + "grad_norm": 96.7734662987067, + "learning_rate": 3.055555555555556e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.8046875, + "logps/chosen": -1392.0, + "logps/rejected": -784.0, + "loss": 0.7322, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.009368896484375, + "rewards/margins": 1.873046875, + "rewards/rejected": -1.880859375, + 
"step": 155 + }, + { + "epoch": 0.030953916364899053, + "grad_norm": 63.28668668778288, + "learning_rate": 3.0753968253968257e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.32421875, + "logps/chosen": -764.0, + "logps/rejected": -717.0, + "loss": 0.835, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.141876220703125, + "rewards/margins": 0.960693359375, + "rewards/rejected": -1.10498046875, + "step": 156 + }, + { + "epoch": 0.031152338905699687, + "grad_norm": 81.00610717511982, + "learning_rate": 3.095238095238095e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.421875, + "logps/chosen": -927.0, + "logps/rejected": -1621.0, + "loss": 0.8358, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.023284912109375, + "rewards/margins": 1.5322265625, + "rewards/rejected": -1.5517578125, + "step": 157 + }, + { + "epoch": 0.03135076144650032, + "grad_norm": 74.07996230077758, + "learning_rate": 3.115079365079365e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.390625, + "logps/chosen": -1214.0, + "logps/rejected": -706.0, + "loss": 0.8054, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.0576171875, + "rewards/margins": 1.412109375, + "rewards/rejected": -1.4677734375, + "step": 158 + }, + { + "epoch": 0.03154918398730096, + "grad_norm": 63.41699116769845, + "learning_rate": 3.1349206349206346e-07, + "logits/chosen": 4.75, + "logits/rejected": 4.9375, + "logps/chosen": -723.5, + "logps/rejected": -900.5, + "loss": 0.8064, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.08489990234375, + "rewards/margins": 1.2509765625, + "rewards/rejected": -1.1650390625, + "step": 159 + }, + { + "epoch": 0.031747606528101595, + "grad_norm": 98.11909036110352, + "learning_rate": 3.1547619047619045e-07, + "logits/chosen": 4.59765625, + "logits/rejected": 5.1484375, + "logps/chosen": -910.0, + "logps/rejected": -744.0, + "loss": 0.813, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.0792236328125, + "rewards/margins": 1.154296875, + 
"rewards/rejected": -1.0771484375, + "step": 160 + }, + { + "epoch": 0.031946029068902225, + "grad_norm": 64.59521629690079, + "learning_rate": 3.1746031746031743e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.5546875, + "logps/chosen": -1069.0, + "logps/rejected": -1300.0, + "loss": 0.8145, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.145263671875, + "rewards/margins": 1.48828125, + "rewards/rejected": -1.634765625, + "step": 161 + }, + { + "epoch": 0.03214445160970286, + "grad_norm": 67.49430086497642, + "learning_rate": 3.194444444444444e-07, + "logits/chosen": 4.8125, + "logits/rejected": 5.0234375, + "logps/chosen": -803.5, + "logps/rejected": -930.0, + "loss": 0.7795, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07452392578125, + "rewards/margins": 1.4462890625, + "rewards/rejected": -1.373046875, + "step": 162 + }, + { + "epoch": 0.0323428741505035, + "grad_norm": 93.17660455843453, + "learning_rate": 3.2142857142857145e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 4.546875, + "logps/chosen": -595.0, + "logps/rejected": -928.0, + "loss": 0.8182, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.072509765625, + "rewards/margins": 1.134765625, + "rewards/rejected": -1.0654296875, + "step": 163 + }, + { + "epoch": 0.03254129669130413, + "grad_norm": 56.499741361705, + "learning_rate": 3.2341269841269844e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.12890625, + "logps/chosen": -657.0, + "logps/rejected": -655.5, + "loss": 0.8583, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0430908203125, + "rewards/margins": 0.876953125, + "rewards/rejected": -0.8330078125, + "step": 164 + }, + { + "epoch": 0.032739719232104766, + "grad_norm": 70.80940729713897, + "learning_rate": 3.2539682539682537e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.625, + "logps/chosen": -881.0, + "logps/rejected": -673.0, + "loss": 0.8519, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.033203125, + "rewards/margins": 
1.205078125, + "rewards/rejected": -1.17333984375, + "step": 165 + }, + { + "epoch": 0.032938141772905404, + "grad_norm": 60.52569562286382, + "learning_rate": 3.2738095238095235e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 3.99609375, + "logps/chosen": -1011.0, + "logps/rejected": -569.0, + "loss": 0.7635, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.2310791015625, + "rewards/margins": 1.41796875, + "rewards/rejected": -1.1865234375, + "step": 166 + }, + { + "epoch": 0.033136564313706034, + "grad_norm": 77.79032174266467, + "learning_rate": 3.2936507936507933e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.4296875, + "logps/chosen": -1145.0, + "logps/rejected": -927.0, + "loss": 0.7387, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.078369140625, + "rewards/margins": 1.794921875, + "rewards/rejected": -1.71484375, + "step": 167 + }, + { + "epoch": 0.03333498685450667, + "grad_norm": 74.63821903240478, + "learning_rate": 3.313492063492063e-07, + "logits/chosen": 4.36328125, + "logits/rejected": 4.5390625, + "logps/chosen": -1141.0, + "logps/rejected": -1693.0, + "loss": 0.7383, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06097412109375, + "rewards/margins": 2.302734375, + "rewards/rejected": -2.2421875, + "step": 168 + }, + { + "epoch": 0.03353340939530731, + "grad_norm": 70.91990964392686, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": 4.3515625, + "logits/rejected": 4.546875, + "logps/chosen": -806.5, + "logps/rejected": -724.5, + "loss": 0.7828, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.2012939453125, + "rewards/margins": 1.1044921875, + "rewards/rejected": -0.9033203125, + "step": 169 + }, + { + "epoch": 0.033731831936107945, + "grad_norm": 80.8837904456925, + "learning_rate": 3.353174603174603e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.58984375, + "logps/chosen": -1350.0, + "logps/rejected": -941.0, + "loss": 0.6802, + "rewards/accuracies": 0.875, + "rewards/chosen": 
0.293212890625, + "rewards/margins": 2.283203125, + "rewards/rejected": -1.98828125, + "step": 170 + }, + { + "epoch": 0.033930254476908575, + "grad_norm": 69.83858397810432, + "learning_rate": 3.373015873015873e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.5234375, + "logps/chosen": -973.0, + "logps/rejected": -703.5, + "loss": 0.7889, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.1190185546875, + "rewards/margins": 1.431640625, + "rewards/rejected": -1.3154296875, + "step": 171 + }, + { + "epoch": 0.03412867701770921, + "grad_norm": 67.28296145123608, + "learning_rate": 3.392857142857143e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.90234375, + "logps/chosen": -970.0, + "logps/rejected": -579.0, + "loss": 0.7943, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.092529296875, + "rewards/margins": 1.162109375, + "rewards/rejected": -1.068359375, + "step": 172 + }, + { + "epoch": 0.03432709955850985, + "grad_norm": 69.03412755783015, + "learning_rate": 3.412698412698413e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.6171875, + "logps/chosen": -1038.0, + "logps/rejected": -709.0, + "loss": 0.766, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.400146484375, + "rewards/margins": 1.57421875, + "rewards/rejected": -1.1728515625, + "step": 173 + }, + { + "epoch": 0.03452552209931048, + "grad_norm": 155.1647120398018, + "learning_rate": 3.432539682539682e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.55078125, + "logps/chosen": -1031.0, + "logps/rejected": -1740.0, + "loss": 0.7102, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.39501953125, + "rewards/margins": 1.859375, + "rewards/rejected": -1.462890625, + "step": 174 + }, + { + "epoch": 0.03472394464011112, + "grad_norm": 73.64845481516333, + "learning_rate": 3.452380952380952e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.40625, + "logps/chosen": -994.0, + "logps/rejected": -892.0, + "loss": 0.7884, + "rewards/accuracies": 0.8125, + 
"rewards/chosen": 0.3031005859375, + "rewards/margins": 1.76953125, + "rewards/rejected": -1.466796875, + "step": 175 + }, + { + "epoch": 0.034922367180911754, + "grad_norm": 70.55825597476664, + "learning_rate": 3.472222222222222e-07, + "logits/chosen": 4.52734375, + "logits/rejected": 4.734375, + "logps/chosen": -1297.0, + "logps/rejected": -750.75, + "loss": 0.8402, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.334228515625, + "rewards/margins": 0.728515625, + "rewards/rejected": -1.060546875, + "step": 176 + }, + { + "epoch": 0.035120789721712384, + "grad_norm": 70.29517080177655, + "learning_rate": 3.4920634920634917e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.4921875, + "logps/chosen": -918.0, + "logps/rejected": -667.0, + "loss": 0.7937, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.1103515625, + "rewards/margins": 1.42578125, + "rewards/rejected": -1.3154296875, + "step": 177 + }, + { + "epoch": 0.03531921226251302, + "grad_norm": 64.3218810340208, + "learning_rate": 3.5119047619047615e-07, + "logits/chosen": 4.40234375, + "logits/rejected": 4.6328125, + "logps/chosen": -1036.0, + "logps/rejected": -766.0, + "loss": 0.7264, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1968994140625, + "rewards/margins": 1.869140625, + "rewards/rejected": -1.671875, + "step": 178 + }, + { + "epoch": 0.03551763480331366, + "grad_norm": 63.288353621560674, + "learning_rate": 3.531746031746032e-07, + "logits/chosen": 4.7734375, + "logits/rejected": 4.640625, + "logps/chosen": -882.0, + "logps/rejected": -616.0, + "loss": 0.7599, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.067626953125, + "rewards/margins": 1.7041015625, + "rewards/rejected": -1.6357421875, + "step": 179 + }, + { + "epoch": 0.03571605734411429, + "grad_norm": 66.94909915755058, + "learning_rate": 3.5515873015873017e-07, + "logits/chosen": 4.33984375, + "logits/rejected": 4.4453125, + "logps/chosen": -836.0, + "logps/rejected": -579.0, + "loss": 0.7643, + 
"rewards/accuracies": 0.75, + "rewards/chosen": 0.203369140625, + "rewards/margins": 1.3486328125, + "rewards/rejected": -1.1455078125, + "step": 180 + }, + { + "epoch": 0.035914479884914925, + "grad_norm": 70.63143786190703, + "learning_rate": 3.5714285714285716e-07, + "logits/chosen": 4.6171875, + "logits/rejected": 4.46875, + "logps/chosen": -1019.0, + "logps/rejected": -507.0, + "loss": 0.7883, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.383056640625, + "rewards/margins": 1.2744140625, + "rewards/rejected": -0.8916015625, + "step": 181 + }, + { + "epoch": 0.03611290242571556, + "grad_norm": 72.136182599076, + "learning_rate": 3.591269841269841e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.296875, + "logps/chosen": -1115.0, + "logps/rejected": -617.0, + "loss": 0.6923, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.49560546875, + "rewards/margins": 1.8671875, + "rewards/rejected": -1.3681640625, + "step": 182 + }, + { + "epoch": 0.0363113249665162, + "grad_norm": 69.5111234489097, + "learning_rate": 3.6111111111111107e-07, + "logits/chosen": 3.62109375, + "logits/rejected": 3.85546875, + "logps/chosen": -892.0, + "logps/rejected": -608.5, + "loss": 0.7397, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4091796875, + "rewards/margins": 1.455078125, + "rewards/rejected": -1.044921875, + "step": 183 + }, + { + "epoch": 0.03650974750731683, + "grad_norm": 64.42025024585905, + "learning_rate": 3.6309523809523805e-07, + "logits/chosen": 4.33203125, + "logits/rejected": 4.46875, + "logps/chosen": -1020.0, + "logps/rejected": -682.0, + "loss": 0.7864, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.329345703125, + "rewards/margins": 1.14453125, + "rewards/rejected": -0.8154296875, + "step": 184 + }, + { + "epoch": 0.03670817004811747, + "grad_norm": 65.54638444327716, + "learning_rate": 3.6507936507936504e-07, + "logits/chosen": 4.5546875, + "logits/rejected": 4.6953125, + "logps/chosen": -991.0, + "logps/rejected": -672.0, + 
"loss": 0.7556, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.453857421875, + "rewards/margins": 1.44921875, + "rewards/rejected": -0.9951171875, + "step": 185 + }, + { + "epoch": 0.036906592588918104, + "grad_norm": 79.26735609128782, + "learning_rate": 3.67063492063492e-07, + "logits/chosen": 3.890625, + "logits/rejected": 4.2421875, + "logps/chosen": -973.0, + "logps/rejected": -644.0, + "loss": 0.754, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4091796875, + "rewards/margins": 1.4697265625, + "rewards/rejected": -1.056640625, + "step": 186 + }, + { + "epoch": 0.037105015129718734, + "grad_norm": 70.58815209684337, + "learning_rate": 3.6904761904761906e-07, + "logits/chosen": 4.6875, + "logits/rejected": 5.0546875, + "logps/chosen": -991.0, + "logps/rejected": -594.0, + "loss": 0.726, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45941162109375, + "rewards/margins": 1.580078125, + "rewards/rejected": -1.119140625, + "step": 187 + }, + { + "epoch": 0.03730343767051937, + "grad_norm": 58.96641390607661, + "learning_rate": 3.7103174603174604e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.53125, + "logps/chosen": -1002.0, + "logps/rejected": -1181.5, + "loss": 0.7056, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4698486328125, + "rewards/margins": 1.9169921875, + "rewards/rejected": -1.44921875, + "step": 188 + }, + { + "epoch": 0.03750186021132001, + "grad_norm": 83.7827350779954, + "learning_rate": 3.73015873015873e-07, + "logits/chosen": 5.0234375, + "logits/rejected": 4.9453125, + "logps/chosen": -895.0, + "logps/rejected": -679.0, + "loss": 0.7382, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.66259765625, + "rewards/margins": 1.486328125, + "rewards/rejected": -0.82421875, + "step": 189 + }, + { + "epoch": 0.03770028275212064, + "grad_norm": 61.395725020788305, + "learning_rate": 3.75e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.50390625, + "logps/chosen": -1024.0, + "logps/rejected": -695.5, + 
"loss": 0.7661, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.663818359375, + "rewards/margins": 1.4599609375, + "rewards/rejected": -0.7978515625, + "step": 190 + }, + { + "epoch": 0.037898705292921275, + "grad_norm": 61.44242581196073, + "learning_rate": 3.7698412698412694e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.42578125, + "logps/chosen": -782.0, + "logps/rejected": -552.0, + "loss": 0.7925, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2423095703125, + "rewards/margins": 1.1982421875, + "rewards/rejected": -0.9560546875, + "step": 191 + }, + { + "epoch": 0.03809712783372191, + "grad_norm": 56.04918871994803, + "learning_rate": 3.789682539682539e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 4.21484375, + "logps/chosen": -868.0, + "logps/rejected": -607.0, + "loss": 0.7493, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.68115234375, + "rewards/margins": 1.4609375, + "rewards/rejected": -0.77783203125, + "step": 192 + }, + { + "epoch": 0.03829555037452254, + "grad_norm": 76.22722183342329, + "learning_rate": 3.809523809523809e-07, + "logits/chosen": 4.37109375, + "logits/rejected": 4.34375, + "logps/chosen": -926.5, + "logps/rejected": -570.0, + "loss": 0.7867, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.46429443359375, + "rewards/margins": 1.3017578125, + "rewards/rejected": -0.840576171875, + "step": 193 + }, + { + "epoch": 0.03849397291532318, + "grad_norm": 59.77224261740247, + "learning_rate": 3.8293650793650794e-07, + "logits/chosen": 4.37109375, + "logits/rejected": 4.3359375, + "logps/chosen": -1065.0, + "logps/rejected": -656.0, + "loss": 0.7211, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.541015625, + "rewards/margins": 1.8525390625, + "rewards/rejected": -1.314453125, + "step": 194 + }, + { + "epoch": 0.03869239545612382, + "grad_norm": 62.34541286212457, + "learning_rate": 3.8492063492063493e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.390625, + "logps/chosen": 
-977.0, + "logps/rejected": -727.5, + "loss": 0.6903, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.68017578125, + "rewards/margins": 1.7646484375, + "rewards/rejected": -1.0849609375, + "step": 195 + }, + { + "epoch": 0.038890817996924454, + "grad_norm": 67.19758647761881, + "learning_rate": 3.869047619047619e-07, + "logits/chosen": 4.33984375, + "logits/rejected": 4.4609375, + "logps/chosen": -1134.0, + "logps/rejected": -818.0, + "loss": 0.7086, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.3514404296875, + "rewards/margins": 1.734375, + "rewards/rejected": -1.388671875, + "step": 196 + }, + { + "epoch": 0.039089240537725084, + "grad_norm": 68.02231663998099, + "learning_rate": 3.888888888888889e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.3359375, + "logps/chosen": -1186.0, + "logps/rejected": -676.0, + "loss": 0.6713, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.66552734375, + "rewards/margins": 1.955078125, + "rewards/rejected": -1.29052734375, + "step": 197 + }, + { + "epoch": 0.03928766307852572, + "grad_norm": 62.696142252297285, + "learning_rate": 3.908730158730159e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 4.03515625, + "logps/chosen": -1094.0, + "logps/rejected": -779.0, + "loss": 0.7294, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.748046875, + "rewards/margins": 1.822265625, + "rewards/rejected": -1.07666015625, + "step": 198 + }, + { + "epoch": 0.03948608561932636, + "grad_norm": 74.17962772233044, + "learning_rate": 3.928571428571428e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.31640625, + "logps/chosen": -1360.0, + "logps/rejected": -1105.0, + "loss": 0.6115, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.51611328125, + "rewards/margins": 2.5546875, + "rewards/rejected": -2.0390625, + "step": 199 + }, + { + "epoch": 0.03968450816012699, + "grad_norm": 56.36057323391855, + "learning_rate": 3.948412698412698e-07, + "logits/chosen": 4.265625, + "logits/rejected": 
4.3359375, + "logps/chosen": -932.0, + "logps/rejected": -540.5, + "loss": 0.752, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.5302734375, + "rewards/margins": 1.37109375, + "rewards/rejected": -0.83935546875, + "step": 200 + }, + { + "epoch": 0.039882930700927625, + "grad_norm": 59.82285797792738, + "learning_rate": 3.968253968253968e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.328125, + "logps/chosen": -1070.0, + "logps/rejected": -1152.5, + "loss": 0.6926, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.629638671875, + "rewards/margins": 1.931640625, + "rewards/rejected": -1.2998046875, + "step": 201 + }, + { + "epoch": 0.04008135324172826, + "grad_norm": 80.29802832330772, + "learning_rate": 3.988095238095238e-07, + "logits/chosen": 4.015625, + "logits/rejected": 4.046875, + "logps/chosen": -1410.0, + "logps/rejected": -660.0, + "loss": 0.6782, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.759765625, + "rewards/margins": 1.865234375, + "rewards/rejected": -1.10546875, + "step": 202 + }, + { + "epoch": 0.04027977578252889, + "grad_norm": 49.87069079688846, + "learning_rate": 4.007936507936508e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 3.91015625, + "logps/chosen": -982.5, + "logps/rejected": -630.0, + "loss": 0.7475, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.9462890625, + "rewards/margins": 1.6982421875, + "rewards/rejected": -0.7548828125, + "step": 203 + }, + { + "epoch": 0.04047819832332953, + "grad_norm": 70.08849214991257, + "learning_rate": 4.027777777777778e-07, + "logits/chosen": 4.29296875, + "logits/rejected": 4.4921875, + "logps/chosen": -1215.0, + "logps/rejected": -904.0, + "loss": 0.6082, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.42218017578125, + "rewards/margins": 3.5234375, + "rewards/rejected": -3.107421875, + "step": 204 + }, + { + "epoch": 0.04067662086413017, + "grad_norm": 73.78892318716765, + "learning_rate": 4.0476190476190476e-07, + "logits/chosen": 4.75, + 
"logits/rejected": 4.8671875, + "logps/chosen": -1361.0, + "logps/rejected": -797.0, + "loss": 0.6427, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.53515625, + "rewards/margins": 2.181640625, + "rewards/rejected": -1.6474609375, + "step": 205 + }, + { + "epoch": 0.0408750434049308, + "grad_norm": 67.212983981667, + "learning_rate": 4.0674603174603175e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 4.51171875, + "logps/chosen": -939.0, + "logps/rejected": -683.0, + "loss": 0.6761, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.501953125, + "rewards/margins": 1.970703125, + "rewards/rejected": -1.4716796875, + "step": 206 + }, + { + "epoch": 0.041073465945731434, + "grad_norm": 61.51479430368405, + "learning_rate": 4.087301587301587e-07, + "logits/chosen": 4.4765625, + "logits/rejected": 4.6484375, + "logps/chosen": -1028.0, + "logps/rejected": -683.0, + "loss": 0.7397, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4111328125, + "rewards/margins": 1.986328125, + "rewards/rejected": -1.57421875, + "step": 207 + }, + { + "epoch": 0.04127188848653207, + "grad_norm": 85.98847872304235, + "learning_rate": 4.1071428571428566e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.14453125, + "logps/chosen": -1044.0, + "logps/rejected": -818.0, + "loss": 0.6858, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.7001953125, + "rewards/margins": 1.857421875, + "rewards/rejected": -1.1534423828125, + "step": 208 + }, + { + "epoch": 0.04147031102733271, + "grad_norm": 46.315947109796944, + "learning_rate": 4.1269841269841265e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.359375, + "logps/chosen": -587.5, + "logps/rejected": -748.5, + "loss": 0.7391, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.787109375, + "rewards/margins": 1.580078125, + "rewards/rejected": -0.792236328125, + "step": 209 + }, + { + "epoch": 0.04166873356813334, + "grad_norm": 55.67260605173662, + "learning_rate": 4.146825396825397e-07, + 
"logits/chosen": 4.171875, + "logits/rejected": 4.66796875, + "logps/chosen": -910.0, + "logps/rejected": -718.0, + "loss": 0.7375, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.71142578125, + "rewards/margins": 1.529296875, + "rewards/rejected": -0.81640625, + "step": 210 + }, + { + "epoch": 0.041867156108933976, + "grad_norm": 66.02706976496347, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": 4.359375, + "logits/rejected": 4.34375, + "logps/chosen": -954.0, + "logps/rejected": -784.0, + "loss": 0.7241, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6591796875, + "rewards/margins": 1.7587890625, + "rewards/rejected": -1.099609375, + "step": 211 + }, + { + "epoch": 0.04206557864973461, + "grad_norm": 65.03016527493429, + "learning_rate": 4.1865079365079365e-07, + "logits/chosen": 4.703125, + "logits/rejected": 4.6171875, + "logps/chosen": -888.5, + "logps/rejected": -493.0, + "loss": 0.7385, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.5966796875, + "rewards/margins": 1.642578125, + "rewards/rejected": -1.04296875, + "step": 212 + }, + { + "epoch": 0.04226400119053524, + "grad_norm": 55.113469311719676, + "learning_rate": 4.2063492063492063e-07, + "logits/chosen": 4.46875, + "logits/rejected": 4.5625, + "logps/chosen": -788.0, + "logps/rejected": -631.0, + "loss": 0.7289, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.592529296875, + "rewards/margins": 1.74462890625, + "rewards/rejected": -1.14990234375, + "step": 213 + }, + { + "epoch": 0.04246242373133588, + "grad_norm": 61.11929308747335, + "learning_rate": 4.226190476190476e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.23046875, + "logps/chosen": -1028.0, + "logps/rejected": -578.0, + "loss": 0.6218, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.94091796875, + "rewards/margins": 2.24609375, + "rewards/rejected": -1.306640625, + "step": 214 + }, + { + "epoch": 0.04266084627213652, + "grad_norm": 60.8994107654102, + "learning_rate": 
4.246031746031746e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.65234375, + "logps/chosen": -650.0, + "logps/rejected": -645.0, + "loss": 0.7728, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6875, + "rewards/margins": 1.34375, + "rewards/rejected": -0.6572265625, + "step": 215 + }, + { + "epoch": 0.04285926881293715, + "grad_norm": 53.26750399211849, + "learning_rate": 4.2658730158730153e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.5859375, + "logps/chosen": -752.75, + "logps/rejected": -681.0, + "loss": 0.6994, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.8642578125, + "rewards/margins": 1.8232421875, + "rewards/rejected": -0.962890625, + "step": 216 + }, + { + "epoch": 0.043057691353737784, + "grad_norm": 77.86385401512746, + "learning_rate": 4.285714285714285e-07, + "logits/chosen": 4.578125, + "logits/rejected": 4.7109375, + "logps/chosen": -802.5, + "logps/rejected": -527.5, + "loss": 0.7781, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.66455078125, + "rewards/margins": 1.2841796875, + "rewards/rejected": -0.6171875, + "step": 217 + }, + { + "epoch": 0.04325611389453842, + "grad_norm": 62.97035305919545, + "learning_rate": 4.3055555555555555e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.7109375, + "logps/chosen": -1212.0, + "logps/rejected": -727.0, + "loss": 0.644, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.76806640625, + "rewards/margins": 2.251953125, + "rewards/rejected": -1.4853515625, + "step": 218 + }, + { + "epoch": 0.04345453643533905, + "grad_norm": 84.2525608361808, + "learning_rate": 4.3253968253968253e-07, + "logits/chosen": 4.546875, + "logits/rejected": 4.5390625, + "logps/chosen": -1116.0, + "logps/rejected": -796.5, + "loss": 0.6559, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.689453125, + "rewards/margins": 1.900390625, + "rewards/rejected": -1.2109375, + "step": 219 + }, + { + "epoch": 0.04365295897613969, + "grad_norm": 65.19970747619479, + "learning_rate": 
4.345238095238095e-07, + "logits/chosen": 4.609375, + "logits/rejected": 4.953125, + "logps/chosen": -1052.0, + "logps/rejected": -725.0, + "loss": 0.6934, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5182952880859375, + "rewards/margins": 1.970703125, + "rewards/rejected": -1.4521484375, + "step": 220 + }, + { + "epoch": 0.043851381516940326, + "grad_norm": 60.119844185713994, + "learning_rate": 4.365079365079365e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.171875, + "logps/chosen": -1223.0, + "logps/rejected": -631.0, + "loss": 0.6398, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.471923828125, + "rewards/margins": 2.119140625, + "rewards/rejected": -1.6484375, + "step": 221 + }, + { + "epoch": 0.04404980405774096, + "grad_norm": 49.629058325956976, + "learning_rate": 4.384920634920635e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.546875, + "logps/chosen": -811.0, + "logps/rejected": -541.0, + "loss": 0.7858, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.63623046875, + "rewards/margins": 1.392578125, + "rewards/rejected": -0.754638671875, + "step": 222 + }, + { + "epoch": 0.04424822659854159, + "grad_norm": 58.673954446418115, + "learning_rate": 4.4047619047619047e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.5390625, + "logps/chosen": -1075.0, + "logps/rejected": -1221.0, + "loss": 0.6906, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1640625, + "rewards/margins": 2.078125, + "rewards/rejected": -0.916748046875, + "step": 223 + }, + { + "epoch": 0.04444664913934223, + "grad_norm": 62.213291096419525, + "learning_rate": 4.424603174603174e-07, + "logits/chosen": 4.7109375, + "logits/rejected": 4.6953125, + "logps/chosen": -1314.0, + "logps/rejected": -826.5, + "loss": 0.7548, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3216552734375, + "rewards/margins": 3.216796875, + "rewards/rejected": -2.9033203125, + "step": 224 + }, + { + "epoch": 0.04464507168014287, + "grad_norm": 54.09266759874911, + 
"learning_rate": 4.444444444444444e-07, + "logits/chosen": 4.3984375, + "logits/rejected": 4.6640625, + "logps/chosen": -527.5, + "logps/rejected": -606.0, + "loss": 0.728, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.69921875, + "rewards/margins": 1.5849609375, + "rewards/rejected": -0.8861083984375, + "step": 225 + }, + { + "epoch": 0.0448434942209435, + "grad_norm": 62.808459332679014, + "learning_rate": 4.464285714285714e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 4.2109375, + "logps/chosen": -1200.0, + "logps/rejected": -897.0, + "loss": 0.6273, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.5849609375, + "rewards/margins": 2.439453125, + "rewards/rejected": -1.85546875, + "step": 226 + }, + { + "epoch": 0.045041916761744134, + "grad_norm": 58.34617171190962, + "learning_rate": 4.484126984126984e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.4375, + "logps/chosen": -886.0, + "logps/rejected": -614.0, + "loss": 0.7413, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.66455078125, + "rewards/margins": 1.423828125, + "rewards/rejected": -0.7626953125, + "step": 227 + }, + { + "epoch": 0.04524033930254477, + "grad_norm": 64.76979478981825, + "learning_rate": 4.503968253968254e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.8125, + "logps/chosen": -1187.0, + "logps/rejected": -917.5, + "loss": 0.6445, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.84161376953125, + "rewards/margins": 2.5380859375, + "rewards/rejected": -1.699951171875, + "step": 228 + }, + { + "epoch": 0.0454387618433454, + "grad_norm": 60.43641159094996, + "learning_rate": 4.5238095238095237e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.3359375, + "logps/chosen": -1175.0, + "logps/rejected": -717.0, + "loss": 0.6432, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.748046875, + "rewards/margins": 2.5078125, + "rewards/rejected": -1.7626953125, + "step": 229 + }, + { + "epoch": 0.04563718438414604, + "grad_norm": 
55.290114944192034, + "learning_rate": 4.5436507936507935e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.7734375, + "logps/chosen": -1121.0, + "logps/rejected": -876.0, + "loss": 0.6724, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.85546875, + "rewards/margins": 2.146484375, + "rewards/rejected": -1.29296875, + "step": 230 + }, + { + "epoch": 0.045835606924946676, + "grad_norm": 64.0519863666103, + "learning_rate": 4.5634920634920634e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.44921875, + "logps/chosen": -967.0, + "logps/rejected": -964.0, + "loss": 0.7038, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.677734375, + "rewards/margins": 1.810546875, + "rewards/rejected": -1.1337890625, + "step": 231 + }, + { + "epoch": 0.046034029465747306, + "grad_norm": 55.87810837697565, + "learning_rate": 4.5833333333333327e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 4.2265625, + "logps/chosen": -1133.0, + "logps/rejected": -1414.5, + "loss": 0.6026, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.087890625, + "rewards/margins": 3.076171875, + "rewards/rejected": -1.990234375, + "step": 232 + }, + { + "epoch": 0.04623245200654794, + "grad_norm": 53.18126304307036, + "learning_rate": 4.6031746031746025e-07, + "logits/chosen": 4.46875, + "logits/rejected": 4.7734375, + "logps/chosen": -1122.5, + "logps/rejected": -639.0, + "loss": 0.5997, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1484375, + "rewards/margins": 2.48828125, + "rewards/rejected": -1.33984375, + "step": 233 + }, + { + "epoch": 0.04643087454734858, + "grad_norm": 64.4522574397357, + "learning_rate": 4.623015873015873e-07, + "logits/chosen": 4.47265625, + "logits/rejected": 4.3203125, + "logps/chosen": -1048.0, + "logps/rejected": -616.0, + "loss": 0.717, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.72412109375, + "rewards/margins": 1.5927734375, + "rewards/rejected": -0.8671875, + "step": 234 + }, + { + "epoch": 0.04662929708814922, + "grad_norm": 
60.41682480107802, + "learning_rate": 4.6428571428571427e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.4609375, + "logps/chosen": -1113.0, + "logps/rejected": -598.0, + "loss": 0.6519, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.78759765625, + "rewards/margins": 2.05078125, + "rewards/rejected": -1.265625, + "step": 235 + }, + { + "epoch": 0.04682771962894985, + "grad_norm": 62.27539881275508, + "learning_rate": 4.6626984126984126e-07, + "logits/chosen": 4.43359375, + "logits/rejected": 4.45703125, + "logps/chosen": -1326.0, + "logps/rejected": -829.0, + "loss": 0.6368, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0673828125, + "rewards/margins": 2.41015625, + "rewards/rejected": -1.3427734375, + "step": 236 + }, + { + "epoch": 0.047026142169750484, + "grad_norm": 51.02023368107623, + "learning_rate": 4.6825396825396824e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.33203125, + "logps/chosen": -1096.0, + "logps/rejected": -619.5, + "loss": 0.6451, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.9140625, + "rewards/margins": 2.150390625, + "rewards/rejected": -1.23516845703125, + "step": 237 + }, + { + "epoch": 0.04722456471055112, + "grad_norm": 57.747813463764395, + "learning_rate": 4.702380952380952e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 4.453125, + "logps/chosen": -914.0, + "logps/rejected": -587.5, + "loss": 0.7231, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.70703125, + "rewards/margins": 1.5390625, + "rewards/rejected": -0.8310546875, + "step": 238 + }, + { + "epoch": 0.04742298725135175, + "grad_norm": 63.17232172032774, + "learning_rate": 4.722222222222222e-07, + "logits/chosen": 3.859375, + "logits/rejected": 3.87890625, + "logps/chosen": -886.0, + "logps/rejected": -831.0, + "loss": 0.7222, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.5478515625, + "rewards/margins": 1.705078125, + "rewards/rejected": -1.1572265625, + "step": 239 + }, + { + "epoch": 
0.04762140979215239, + "grad_norm": 66.54799968305063, + "learning_rate": 4.742063492063492e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.30859375, + "logps/chosen": -1184.5, + "logps/rejected": -820.0, + "loss": 0.6094, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.6767578125, + "rewards/margins": 2.486328125, + "rewards/rejected": -1.80859375, + "step": 240 + }, + { + "epoch": 0.047819832332953026, + "grad_norm": 59.661687957847015, + "learning_rate": 4.761904761904761e-07, + "logits/chosen": 4.015625, + "logits/rejected": 3.953125, + "logps/chosen": -997.0, + "logps/rejected": -608.5, + "loss": 0.6471, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.91015625, + "rewards/margins": 2.12890625, + "rewards/rejected": -1.2177734375, + "step": 241 + }, + { + "epoch": 0.048018254873753656, + "grad_norm": 43.20287460174957, + "learning_rate": 4.781746031746032e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.5078125, + "logps/chosen": -748.5, + "logps/rejected": -565.5, + "loss": 0.6575, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7060546875, + "rewards/margins": 2.431640625, + "rewards/rejected": -1.724609375, + "step": 242 + }, + { + "epoch": 0.04821667741455429, + "grad_norm": 58.426634712539276, + "learning_rate": 4.801587301587301e-07, + "logits/chosen": 4.609375, + "logits/rejected": 5.078125, + "logps/chosen": -884.0, + "logps/rejected": -867.0, + "loss": 0.6566, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.1279296875, + "rewards/margins": 2.134765625, + "rewards/rejected": -1.0068359375, + "step": 243 + }, + { + "epoch": 0.04841509995535493, + "grad_norm": 56.2120278253623, + "learning_rate": 4.821428571428571e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.1875, + "logps/chosen": -875.0, + "logps/rejected": -603.0, + "loss": 0.6557, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.39080810546875, + "rewards/margins": 2.22265625, + "rewards/rejected": -1.8330078125, + "step": 244 + }, + { + "epoch": 
0.04861352249615556, + "grad_norm": 59.9754802180332, + "learning_rate": 4.841269841269841e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.2109375, + "logps/chosen": -1055.0, + "logps/rejected": -789.0, + "loss": 0.5659, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.78662109375, + "rewards/margins": 2.90234375, + "rewards/rejected": -2.11328125, + "step": 245 + }, + { + "epoch": 0.0488119450369562, + "grad_norm": 54.8337858282796, + "learning_rate": 4.861111111111111e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.46484375, + "logps/chosen": -1073.0, + "logps/rejected": -1884.0, + "loss": 0.6723, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5281982421875, + "rewards/margins": 2.802734375, + "rewards/rejected": -2.2734375, + "step": 246 + }, + { + "epoch": 0.049010367577756835, + "grad_norm": 66.64308093585647, + "learning_rate": 4.880952380952381e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 3.98046875, + "logps/chosen": -1006.0, + "logps/rejected": -648.0, + "loss": 0.7403, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.54443359375, + "rewards/margins": 1.505859375, + "rewards/rejected": -0.96044921875, + "step": 247 + }, + { + "epoch": 0.049208790118557465, + "grad_norm": 52.34449574911029, + "learning_rate": 4.900793650793651e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.625, + "logps/chosen": -752.0, + "logps/rejected": -714.0, + "loss": 0.6876, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.65087890625, + "rewards/margins": 2.16796875, + "rewards/rejected": -1.515625, + "step": 248 + }, + { + "epoch": 0.0494072126593581, + "grad_norm": 64.55393372366383, + "learning_rate": 4.92063492063492e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.203125, + "logps/chosen": -1079.0, + "logps/rejected": -726.0, + "loss": 0.675, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.80126953125, + "rewards/margins": 2.095703125, + "rewards/rejected": -1.2978515625, + "step": 249 + }, + { + 
"epoch": 0.04960563520015874, + "grad_norm": 62.468040126105215, + "learning_rate": 4.94047619047619e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.33203125, + "logps/chosen": -1218.0, + "logps/rejected": -839.0, + "loss": 0.6195, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.625732421875, + "rewards/margins": 4.359375, + "rewards/rejected": -3.72265625, + "step": 250 + }, + { + "epoch": 0.049804057740959376, + "grad_norm": 50.66147270128868, + "learning_rate": 4.96031746031746e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.421875, + "logps/chosen": -921.0, + "logps/rejected": -627.0, + "loss": 0.6695, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.62628173828125, + "rewards/margins": 2.3984375, + "rewards/rejected": -1.775390625, + "step": 251 + }, + { + "epoch": 0.050002480281760006, + "grad_norm": 60.17857628257001, + "learning_rate": 4.98015873015873e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.2734375, + "logps/chosen": -1091.0, + "logps/rejected": -606.5, + "loss": 0.6617, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.705078125, + "rewards/margins": 2.109375, + "rewards/rejected": -1.40625, + "step": 252 + }, + { + "epoch": 0.05020090282256064, + "grad_norm": 53.97200218748698, + "learning_rate": 5e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.22265625, + "logps/chosen": -897.0, + "logps/rejected": -538.0, + "loss": 0.653, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.647705078125, + "rewards/margins": 2.169921875, + "rewards/rejected": -1.5205078125, + "step": 253 + }, + { + "epoch": 0.05039932536336128, + "grad_norm": 66.30792981196181, + "learning_rate": 5.01984126984127e-07, + "logits/chosen": 3.71875, + "logits/rejected": 3.79296875, + "logps/chosen": -1113.0, + "logps/rejected": -1277.5, + "loss": 0.6706, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.703125, + "rewards/margins": 2.345703125, + "rewards/rejected": -1.640625, + "step": 254 + }, + { + "epoch": 
0.05059774790416191, + "grad_norm": 54.47360490835319, + "learning_rate": 5.039682539682539e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.3984375, + "logps/chosen": -806.0, + "logps/rejected": -776.0, + "loss": 0.6783, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.73779296875, + "rewards/margins": 2.009765625, + "rewards/rejected": -1.2724609375, + "step": 255 + }, + { + "epoch": 0.05079617044496255, + "grad_norm": 54.66508076537417, + "learning_rate": 5.059523809523809e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.7265625, + "logps/chosen": -1007.0, + "logps/rejected": -1071.5, + "loss": 0.6506, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.738525390625, + "rewards/margins": 2.529296875, + "rewards/rejected": -1.794921875, + "step": 256 + }, + { + "epoch": 0.050994592985763185, + "grad_norm": 48.69248948692936, + "learning_rate": 5.079365079365079e-07, + "logits/chosen": 4.62109375, + "logits/rejected": 4.8203125, + "logps/chosen": -977.0, + "logps/rejected": -508.5, + "loss": 0.6861, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.64501953125, + "rewards/margins": 2.0029296875, + "rewards/rejected": -1.35546875, + "step": 257 + }, + { + "epoch": 0.051193015526563815, + "grad_norm": 44.980695103748, + "learning_rate": 5.099206349206349e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.6171875, + "logps/chosen": -633.0, + "logps/rejected": -440.0, + "loss": 0.6874, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7392578125, + "rewards/margins": 1.880859375, + "rewards/rejected": -1.1416015625, + "step": 258 + }, + { + "epoch": 0.05139143806736445, + "grad_norm": 67.04993487462846, + "learning_rate": 5.119047619047619e-07, + "logits/chosen": 4.640625, + "logits/rejected": 4.7109375, + "logps/chosen": -1222.0, + "logps/rejected": -695.0, + "loss": 0.7072, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.53375244140625, + "rewards/margins": 1.84765625, + "rewards/rejected": -1.31494140625, + "step": 
259 + }, + { + "epoch": 0.05158986060816509, + "grad_norm": 57.82680720942183, + "learning_rate": 5.138888888888889e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.3671875, + "logps/chosen": -1121.0, + "logps/rejected": -746.0, + "loss": 0.6519, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.9033203125, + "rewards/margins": 1.978515625, + "rewards/rejected": -1.07958984375, + "step": 260 + }, + { + "epoch": 0.05178828314896572, + "grad_norm": 56.772874571933166, + "learning_rate": 5.15873015873016e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.515625, + "logps/chosen": -1012.0, + "logps/rejected": -590.0, + "loss": 0.6574, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.6123046875, + "rewards/margins": 2.09375, + "rewards/rejected": -1.4833984375, + "step": 261 + }, + { + "epoch": 0.051986705689766356, + "grad_norm": 67.64157781940307, + "learning_rate": 5.178571428571428e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.23046875, + "logps/chosen": -1046.0, + "logps/rejected": -884.0, + "loss": 0.6267, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.3203125, + "rewards/margins": 2.515625, + "rewards/rejected": -1.1982421875, + "step": 262 + }, + { + "epoch": 0.05218512823056699, + "grad_norm": 52.70567291671549, + "learning_rate": 5.198412698412698e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 4.0859375, + "logps/chosen": -1296.0, + "logps/rejected": -1100.0, + "loss": 0.6337, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.98486328125, + "rewards/margins": 2.521484375, + "rewards/rejected": -1.541015625, + "step": 263 + }, + { + "epoch": 0.05238355077136763, + "grad_norm": 60.645526512625686, + "learning_rate": 5.218253968253968e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.26953125, + "logps/chosen": -1013.0, + "logps/rejected": -560.0, + "loss": 0.6216, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8251953125, + "rewards/margins": 2.48828125, + "rewards/rejected": 
-1.6630859375, + "step": 264 + }, + { + "epoch": 0.05258197331216826, + "grad_norm": 319.62347961566775, + "learning_rate": 5.238095238095238e-07, + "logits/chosen": 4.55078125, + "logits/rejected": 4.6484375, + "logps/chosen": -834.0, + "logps/rejected": -545.0, + "loss": 0.617, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.8681640625, + "rewards/margins": 2.2578125, + "rewards/rejected": -1.3876953125, + "step": 265 + }, + { + "epoch": 0.0527803958529689, + "grad_norm": 55.74117274148664, + "learning_rate": 5.257936507936508e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 4.32421875, + "logps/chosen": -1219.0, + "logps/rejected": -701.0, + "loss": 0.5897, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.8048095703125, + "rewards/margins": 2.33984375, + "rewards/rejected": -1.5322265625, + "step": 266 + }, + { + "epoch": 0.052978818393769535, + "grad_norm": 52.29758997544688, + "learning_rate": 5.277777777777777e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.16796875, + "logps/chosen": -901.5, + "logps/rejected": -489.5, + "loss": 0.6744, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.71435546875, + "rewards/margins": 2.09716796875, + "rewards/rejected": -1.38232421875, + "step": 267 + }, + { + "epoch": 0.053177240934570165, + "grad_norm": 68.78603412375698, + "learning_rate": 5.297619047619047e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.34375, + "logps/chosen": -1042.0, + "logps/rejected": -1128.5, + "loss": 0.7324, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.4703369140625, + "rewards/margins": 2.27734375, + "rewards/rejected": -1.8076171875, + "step": 268 + }, + { + "epoch": 0.0533756634753708, + "grad_norm": 57.56106421800983, + "learning_rate": 5.317460317460317e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.28125, + "logps/chosen": -1011.0, + "logps/rejected": -602.0, + "loss": 0.6438, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.73388671875, + "rewards/margins": 
2.412109375, + "rewards/rejected": -1.67578125, + "step": 269 + }, + { + "epoch": 0.05357408601617144, + "grad_norm": 56.702959314399386, + "learning_rate": 5.337301587301587e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 4.44140625, + "logps/chosen": -1138.0, + "logps/rejected": -858.0, + "loss": 0.5909, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.1220703125, + "rewards/margins": 2.79296875, + "rewards/rejected": -1.67333984375, + "step": 270 + }, + { + "epoch": 0.05377250855697207, + "grad_norm": 63.093553218426784, + "learning_rate": 5.357142857142857e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.6796875, + "logps/chosen": -1340.0, + "logps/rejected": -1732.0, + "loss": 0.551, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.94677734375, + "rewards/margins": 3.99609375, + "rewards/rejected": -3.046875, + "step": 271 + }, + { + "epoch": 0.053970931097772706, + "grad_norm": 68.08747945108401, + "learning_rate": 5.376984126984127e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.13671875, + "logps/chosen": -995.0, + "logps/rejected": -602.0, + "loss": 0.6491, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7060546875, + "rewards/margins": 2.19140625, + "rewards/rejected": -1.48828125, + "step": 272 + }, + { + "epoch": 0.05416935363857334, + "grad_norm": 60.77649196490188, + "learning_rate": 5.396825396825396e-07, + "logits/chosen": 4.859375, + "logits/rejected": 4.94921875, + "logps/chosen": -1103.0, + "logps/rejected": -635.0, + "loss": 0.6623, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.80224609375, + "rewards/margins": 2.0390625, + "rewards/rejected": -1.2373046875, + "step": 273 + }, + { + "epoch": 0.054367776179373974, + "grad_norm": 51.55666177094439, + "learning_rate": 5.416666666666666e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.5234375, + "logps/chosen": -692.5, + "logps/rejected": -646.0, + "loss": 0.6785, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.66796875, + 
"rewards/margins": 2.103515625, + "rewards/rejected": -1.4375, + "step": 274 + }, + { + "epoch": 0.05456619872017461, + "grad_norm": 59.955588470234986, + "learning_rate": 5.436507936507936e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.7734375, + "logps/chosen": -941.0, + "logps/rejected": -1387.0, + "loss": 0.7116, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.37094879150390625, + "rewards/margins": 2.017578125, + "rewards/rejected": -1.646484375, + "step": 275 + }, + { + "epoch": 0.05476462126097525, + "grad_norm": 75.35210048817262, + "learning_rate": 5.456349206349206e-07, + "logits/chosen": 3.6015625, + "logits/rejected": 3.8515625, + "logps/chosen": -859.0, + "logps/rejected": -774.0, + "loss": 0.6701, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.83447265625, + "rewards/margins": 2.259765625, + "rewards/rejected": -1.427734375, + "step": 276 + }, + { + "epoch": 0.054963043801775885, + "grad_norm": 48.705104120867375, + "learning_rate": 5.476190476190477e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.33203125, + "logps/chosen": -1011.0, + "logps/rejected": -623.5, + "loss": 0.6439, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0654296875, + "rewards/margins": 2.1796875, + "rewards/rejected": -1.1142578125, + "step": 277 + }, + { + "epoch": 0.055161466342576515, + "grad_norm": 47.61340845897722, + "learning_rate": 5.496031746031747e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.4921875, + "logps/chosen": -889.0, + "logps/rejected": -1044.0, + "loss": 0.6385, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.03857421875, + "rewards/margins": 2.90234375, + "rewards/rejected": -1.865234375, + "step": 278 + }, + { + "epoch": 0.05535988888337715, + "grad_norm": 62.6167579758089, + "learning_rate": 5.515873015873016e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.6953125, + "logps/chosen": -860.0, + "logps/rejected": -1470.0, + "loss": 0.778, + "rewards/accuracies": 0.75, + "rewards/chosen": 
0.40167236328125, + "rewards/margins": 1.75390625, + "rewards/rejected": -1.3525390625, + "step": 279 + }, + { + "epoch": 0.05555831142417779, + "grad_norm": 64.12359443837353, + "learning_rate": 5.535714285714285e-07, + "logits/chosen": 4.5390625, + "logits/rejected": 4.46875, + "logps/chosen": -1079.5, + "logps/rejected": -803.0, + "loss": 0.6669, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6805419921875, + "rewards/margins": 2.41748046875, + "rewards/rejected": -1.7333984375, + "step": 280 + }, + { + "epoch": 0.05575673396497842, + "grad_norm": 50.273305789675405, + "learning_rate": 5.555555555555555e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.6875, + "logps/chosen": -841.0, + "logps/rejected": -1757.0, + "loss": 0.6253, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7275390625, + "rewards/margins": 3.646484375, + "rewards/rejected": -2.923828125, + "step": 281 + }, + { + "epoch": 0.055955156505779056, + "grad_norm": 59.03752077121945, + "learning_rate": 5.575396825396825e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.43359375, + "logps/chosen": -842.0, + "logps/rejected": -952.5, + "loss": 0.6883, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.75390625, + "rewards/margins": 2.001953125, + "rewards/rejected": -1.248779296875, + "step": 282 + }, + { + "epoch": 0.056153579046579694, + "grad_norm": 45.09939098340266, + "learning_rate": 5.595238095238095e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.25390625, + "logps/chosen": -878.0, + "logps/rejected": -563.0, + "loss": 0.6674, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8134765625, + "rewards/margins": 2.091796875, + "rewards/rejected": -1.2803955078125, + "step": 283 + }, + { + "epoch": 0.056352001587380324, + "grad_norm": 58.773541322451344, + "learning_rate": 5.615079365079365e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.51953125, + "logps/chosen": -963.0, + "logps/rejected": -634.5, + "loss": 0.6455, + "rewards/accuracies": 
0.875, + "rewards/chosen": 0.708984375, + "rewards/margins": 2.23046875, + "rewards/rejected": -1.515625, + "step": 284 + }, + { + "epoch": 0.05655042412818096, + "grad_norm": 52.8510534542938, + "learning_rate": 5.634920634920635e-07, + "logits/chosen": 3.77734375, + "logits/rejected": 3.96875, + "logps/chosen": -852.5, + "logps/rejected": -702.0, + "loss": 0.6433, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.93359375, + "rewards/margins": 2.4453125, + "rewards/rejected": -1.5078125, + "step": 285 + }, + { + "epoch": 0.0567488466689816, + "grad_norm": 56.06291253588141, + "learning_rate": 5.654761904761904e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.234375, + "logps/chosen": -889.0, + "logps/rejected": -1121.0, + "loss": 0.6492, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.751953125, + "rewards/margins": 3.001953125, + "rewards/rejected": -2.2578125, + "step": 286 + }, + { + "epoch": 0.05694726920978223, + "grad_norm": 65.05501980162171, + "learning_rate": 5.674603174603174e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.4296875, + "logps/chosen": -1576.0, + "logps/rejected": -846.0, + "loss": 0.5946, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.647705078125, + "rewards/margins": 2.921875, + "rewards/rejected": -2.275390625, + "step": 287 + }, + { + "epoch": 0.057145691750582865, + "grad_norm": 63.79311096672135, + "learning_rate": 5.694444444444444e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.3203125, + "logps/chosen": -1080.0, + "logps/rejected": -881.0, + "loss": 0.6121, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7275390625, + "rewards/margins": 2.6875, + "rewards/rejected": -1.96044921875, + "step": 288 + }, + { + "epoch": 0.0573441142913835, + "grad_norm": 57.712997516625435, + "learning_rate": 5.714285714285714e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 4.0625, + "logps/chosen": -915.5, + "logps/rejected": -547.75, + "loss": 0.6512, + "rewards/accuracies": 0.78125, + 
"rewards/chosen": 0.69580078125, + "rewards/margins": 2.40234375, + "rewards/rejected": -1.7041015625, + "step": 289 + }, + { + "epoch": 0.05754253683218414, + "grad_norm": 46.466442814986934, + "learning_rate": 5.734126984126984e-07, + "logits/chosen": 4.4765625, + "logits/rejected": 4.7734375, + "logps/chosen": -897.0, + "logps/rejected": -546.5, + "loss": 0.745, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.9638671875, + "rewards/margins": 1.84765625, + "rewards/rejected": -0.8818359375, + "step": 290 + }, + { + "epoch": 0.05774095937298477, + "grad_norm": 51.328721428712605, + "learning_rate": 5.753968253968254e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.7109375, + "logps/chosen": -1093.0, + "logps/rejected": -999.0, + "loss": 0.6926, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.8330078125, + "rewards/margins": 2.4189453125, + "rewards/rejected": -1.58538818359375, + "step": 291 + }, + { + "epoch": 0.05793938191378541, + "grad_norm": 58.69141395638989, + "learning_rate": 5.773809523809523e-07, + "logits/chosen": 4.6484375, + "logits/rejected": 4.6875, + "logps/chosen": -861.0, + "logps/rejected": -579.0, + "loss": 0.6637, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.38671875, + "rewards/margins": 2.333984375, + "rewards/rejected": -1.947265625, + "step": 292 + }, + { + "epoch": 0.058137804454586044, + "grad_norm": 58.307001112873905, + "learning_rate": 5.793650793650794e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.2578125, + "logps/chosen": -869.0, + "logps/rejected": -842.0, + "loss": 0.692, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.76171875, + "rewards/margins": 1.9375, + "rewards/rejected": -1.17578125, + "step": 293 + }, + { + "epoch": 0.058336226995386674, + "grad_norm": 48.27310270784834, + "learning_rate": 5.813492063492064e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.24609375, + "logps/chosen": -875.0, + "logps/rejected": -672.5, + "loss": 0.594, + "rewards/accuracies": 0.84375, 
+ "rewards/chosen": 0.88671875, + "rewards/margins": 2.93359375, + "rewards/rejected": -2.04296875, + "step": 294 + }, + { + "epoch": 0.05853464953618731, + "grad_norm": 51.085220599662605, + "learning_rate": 5.833333333333334e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.1640625, + "logps/chosen": -1006.0, + "logps/rejected": -704.0, + "loss": 0.6319, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.1279296875, + "rewards/margins": 2.29296875, + "rewards/rejected": -1.166015625, + "step": 295 + }, + { + "epoch": 0.05873307207698795, + "grad_norm": 59.3880154176649, + "learning_rate": 5.853174603174603e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.3125, + "logps/chosen": -1216.0, + "logps/rejected": -715.0, + "loss": 0.6657, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.81494140625, + "rewards/margins": 2.208984375, + "rewards/rejected": -1.3955078125, + "step": 296 + }, + { + "epoch": 0.05893149461778858, + "grad_norm": 53.23155630932962, + "learning_rate": 5.873015873015873e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.4453125, + "logps/chosen": -1104.0, + "logps/rejected": -747.5, + "loss": 0.5805, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.158203125, + "rewards/margins": 2.8125, + "rewards/rejected": -1.65234375, + "step": 297 + }, + { + "epoch": 0.059129917158589215, + "grad_norm": 54.08005947086061, + "learning_rate": 5.892857142857142e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.3828125, + "logps/chosen": -957.0, + "logps/rejected": -660.0, + "loss": 0.7087, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.415283203125, + "rewards/margins": 3.1953125, + "rewards/rejected": -2.78515625, + "step": 298 + }, + { + "epoch": 0.05932833969938985, + "grad_norm": 50.43975410646839, + "learning_rate": 5.912698412698412e-07, + "logits/chosen": 4.46875, + "logits/rejected": 4.546875, + "logps/chosen": -832.0, + "logps/rejected": -578.5, + "loss": 0.6368, + "rewards/accuracies": 0.90625, + 
"rewards/chosen": 1.1083984375, + "rewards/margins": 2.037109375, + "rewards/rejected": -0.9296875, + "step": 299 + }, + { + "epoch": 0.05952676224019048, + "grad_norm": 47.38528720829546, + "learning_rate": 5.932539682539682e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.25, + "logps/chosen": -804.0, + "logps/rejected": -648.5, + "loss": 0.6812, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.984375, + "rewards/margins": 1.83154296875, + "rewards/rejected": -0.84881591796875, + "step": 300 + }, + { + "epoch": 0.05972518478099112, + "grad_norm": 52.16284195636414, + "learning_rate": 5.952380952380952e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.3671875, + "logps/chosen": -1050.0, + "logps/rejected": -789.0, + "loss": 0.5537, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.795562744140625, + "rewards/margins": 3.09765625, + "rewards/rejected": -2.30078125, + "step": 301 + }, + { + "epoch": 0.05992360732179176, + "grad_norm": 51.9055547385947, + "learning_rate": 5.972222222222222e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.14453125, + "logps/chosen": -806.0, + "logps/rejected": -471.0, + "loss": 0.7296, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.64453125, + "rewards/margins": 1.572265625, + "rewards/rejected": -0.92529296875, + "step": 302 + }, + { + "epoch": 0.060122029862592394, + "grad_norm": 60.990851316442814, + "learning_rate": 5.992063492063492e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.28125, + "logps/chosen": -965.0, + "logps/rejected": -680.0, + "loss": 0.7254, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.29248046875, + "rewards/margins": 1.9140625, + "rewards/rejected": -1.62109375, + "step": 303 + }, + { + "epoch": 0.060320452403393024, + "grad_norm": 46.71440807046551, + "learning_rate": 6.011904761904761e-07, + "logits/chosen": 4.37109375, + "logits/rejected": 4.48046875, + "logps/chosen": -883.5, + "logps/rejected": -735.0, + "loss": 0.6317, + "rewards/accuracies": 
0.78125, + "rewards/chosen": 0.90234375, + "rewards/margins": 2.6279296875, + "rewards/rejected": -1.72265625, + "step": 304 + }, + { + "epoch": 0.06051887494419366, + "grad_norm": 61.53568191836612, + "learning_rate": 6.031746031746031e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.28125, + "logps/chosen": -1190.0, + "logps/rejected": -740.0, + "loss": 0.6353, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7744140625, + "rewards/margins": 2.376953125, + "rewards/rejected": -1.6015625, + "step": 305 + }, + { + "epoch": 0.0607172974849943, + "grad_norm": 47.4312341038526, + "learning_rate": 6.051587301587301e-07, + "logits/chosen": 4.875, + "logits/rejected": 4.703125, + "logps/chosen": -493.5, + "logps/rejected": -413.0, + "loss": 0.7781, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.8662109375, + "rewards/margins": 1.306640625, + "rewards/rejected": -0.441162109375, + "step": 306 + }, + { + "epoch": 0.06091572002579493, + "grad_norm": 63.043093308970164, + "learning_rate": 6.071428571428571e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 4.4453125, + "logps/chosen": -882.0, + "logps/rejected": -570.0, + "loss": 0.6514, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8623046875, + "rewards/margins": 2.060546875, + "rewards/rejected": -1.19921875, + "step": 307 + }, + { + "epoch": 0.061114142566595565, + "grad_norm": 50.739942733978076, + "learning_rate": 6.091269841269841e-07, + "logits/chosen": 4.5625, + "logits/rejected": 4.8828125, + "logps/chosen": -816.5, + "logps/rejected": -641.5, + "loss": 0.6243, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.68896484375, + "rewards/margins": 2.333984375, + "rewards/rejected": -1.6484375, + "step": 308 + }, + { + "epoch": 0.0613125651073962, + "grad_norm": 60.806590826063726, + "learning_rate": 6.111111111111112e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.828125, + "logps/chosen": -970.0, + "logps/rejected": -513.0, + "loss": 0.5524, + "rewards/accuracies": 
0.9375, + "rewards/chosen": 1.046875, + "rewards/margins": 2.6875, + "rewards/rejected": -1.640625, + "step": 309 + }, + { + "epoch": 0.06151098764819683, + "grad_norm": 59.39549725416528, + "learning_rate": 6.130952380952381e-07, + "logits/chosen": 4.5078125, + "logits/rejected": 4.484375, + "logps/chosen": -1142.0, + "logps/rejected": -528.5, + "loss": 0.5996, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.212890625, + "rewards/margins": 2.521484375, + "rewards/rejected": -1.3115234375, + "step": 310 + }, + { + "epoch": 0.06170941018899747, + "grad_norm": 50.63910053134024, + "learning_rate": 6.150793650793651e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.578125, + "logps/chosen": -1041.0, + "logps/rejected": -1456.0, + "loss": 0.6054, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.574462890625, + "rewards/margins": 3.212890625, + "rewards/rejected": -2.64453125, + "step": 311 + }, + { + "epoch": 0.06190783272979811, + "grad_norm": 52.35577924286406, + "learning_rate": 6.170634920634921e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.28125, + "logps/chosen": -869.0, + "logps/rejected": -1523.0, + "loss": 0.5633, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.8369140625, + "rewards/margins": 3.806640625, + "rewards/rejected": -2.966796875, + "step": 312 + }, + { + "epoch": 0.06210625527059874, + "grad_norm": 58.78357573224079, + "learning_rate": 6.19047619047619e-07, + "logits/chosen": 4.625, + "logits/rejected": 4.61328125, + "logps/chosen": -1246.0, + "logps/rejected": -708.5, + "loss": 0.5502, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.23046875, + "rewards/margins": 3.04296875, + "rewards/rejected": -1.8046875, + "step": 313 + }, + { + "epoch": 0.062304677811399374, + "grad_norm": 209.46986693041364, + "learning_rate": 6.21031746031746e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.6015625, + "logps/chosen": -955.0, + "logps/rejected": -1770.0, + "loss": 0.6276, + "rewards/accuracies": 0.8125, + 
"rewards/chosen": 0.71337890625, + "rewards/margins": 3.02734375, + "rewards/rejected": -2.3125, + "step": 314 + }, + { + "epoch": 0.06250310035220001, + "grad_norm": 46.47213230196958, + "learning_rate": 6.23015873015873e-07, + "logits/chosen": 4.44140625, + "logits/rejected": 4.4765625, + "logps/chosen": -1175.0, + "logps/rejected": -773.0, + "loss": 0.5642, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.30078125, + "rewards/margins": 2.953125, + "rewards/rejected": -1.65087890625, + "step": 315 + }, + { + "epoch": 0.06270152289300064, + "grad_norm": 46.02695505430581, + "learning_rate": 6.249999999999999e-07, + "logits/chosen": 3.984375, + "logits/rejected": 4.15234375, + "logps/chosen": -1338.0, + "logps/rejected": -916.0, + "loss": 0.5646, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.3564453125, + "rewards/margins": 3.55078125, + "rewards/rejected": -2.19140625, + "step": 316 + }, + { + "epoch": 0.06289994543380129, + "grad_norm": 51.553521293871704, + "learning_rate": 6.269841269841269e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 4.296875, + "logps/chosen": -686.0, + "logps/rejected": -672.5, + "loss": 0.5998, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.82421875, + "rewards/margins": 2.513671875, + "rewards/rejected": -1.693359375, + "step": 317 + }, + { + "epoch": 0.06309836797460192, + "grad_norm": 57.54205889254828, + "learning_rate": 6.289682539682539e-07, + "logits/chosen": 4.703125, + "logits/rejected": 4.85546875, + "logps/chosen": -1350.0, + "logps/rejected": -738.0, + "loss": 0.517, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.998046875, + "rewards/margins": 3.46875, + "rewards/rejected": -2.46875, + "step": 318 + }, + { + "epoch": 0.06329679051540255, + "grad_norm": 46.02602226845135, + "learning_rate": 6.309523809523809e-07, + "logits/chosen": 3.859375, + "logits/rejected": 4.31640625, + "logps/chosen": -983.0, + "logps/rejected": -683.0, + "loss": 0.6387, + "rewards/accuracies": 0.84375, + 
"rewards/chosen": 0.9169921875, + "rewards/margins": 2.51953125, + "rewards/rejected": -1.6004638671875, + "step": 319 + }, + { + "epoch": 0.06349521305620319, + "grad_norm": 52.33507949041158, + "learning_rate": 6.329365079365079e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.890625, + "logps/chosen": -1023.0, + "logps/rejected": -1871.0, + "loss": 0.4833, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.119140625, + "rewards/margins": 5.25390625, + "rewards/rejected": -4.138671875, + "step": 320 + }, + { + "epoch": 0.06369363559700382, + "grad_norm": 48.795042172208085, + "learning_rate": 6.349206349206349e-07, + "logits/chosen": 4.6875, + "logits/rejected": 4.796875, + "logps/chosen": -978.0, + "logps/rejected": -594.5, + "loss": 0.5631, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.93359375, + "rewards/margins": 2.91796875, + "rewards/rejected": -1.982421875, + "step": 321 + }, + { + "epoch": 0.06389205813780445, + "grad_norm": 51.25426625024301, + "learning_rate": 6.369047619047618e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.37109375, + "logps/chosen": -945.0, + "logps/rejected": -788.0, + "loss": 0.6987, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.79638671875, + "rewards/margins": 2.302734375, + "rewards/rejected": -1.509765625, + "step": 322 + }, + { + "epoch": 0.0640904806786051, + "grad_norm": 55.61893867486892, + "learning_rate": 6.388888888888888e-07, + "logits/chosen": 3.76953125, + "logits/rejected": 3.91796875, + "logps/chosen": -815.0, + "logps/rejected": -1170.0, + "loss": 0.6511, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.66015625, + "rewards/margins": 2.5615234375, + "rewards/rejected": -1.904052734375, + "step": 323 + }, + { + "epoch": 0.06428890321940572, + "grad_norm": 55.14106679779641, + "learning_rate": 6.408730158730159e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.375, + "logps/chosen": -1164.0, + "logps/rejected": -805.0, + "loss": 0.5117, + "rewards/accuracies": 
0.90625, + "rewards/chosen": 1.021484375, + "rewards/margins": 3.4609375, + "rewards/rejected": -2.4375, + "step": 324 + }, + { + "epoch": 0.06448732576020635, + "grad_norm": 50.36256917458664, + "learning_rate": 6.428571428571429e-07, + "logits/chosen": 3.984375, + "logits/rejected": 4.12109375, + "logps/chosen": -1082.0, + "logps/rejected": -754.0, + "loss": 0.6762, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.9951171875, + "rewards/margins": 2.23046875, + "rewards/rejected": -1.23583984375, + "step": 325 + }, + { + "epoch": 0.064685748301007, + "grad_norm": 46.762715796907706, + "learning_rate": 6.448412698412699e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.34375, + "logps/chosen": -1031.0, + "logps/rejected": -662.0, + "loss": 0.6454, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9736328125, + "rewards/margins": 2.44873046875, + "rewards/rejected": -1.4755859375, + "step": 326 + }, + { + "epoch": 0.06488417084180763, + "grad_norm": 50.81692146641719, + "learning_rate": 6.468253968253969e-07, + "logits/chosen": 3.6328125, + "logits/rejected": 4.0, + "logps/chosen": -1212.0, + "logps/rejected": -744.0, + "loss": 0.4937, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.92919921875, + "rewards/margins": 3.734375, + "rewards/rejected": -2.80859375, + "step": 327 + }, + { + "epoch": 0.06508259338260826, + "grad_norm": 54.85004161060237, + "learning_rate": 6.488095238095239e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.3984375, + "logps/chosen": -966.0, + "logps/rejected": -769.0, + "loss": 0.5858, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.876953125, + "rewards/margins": 2.4921875, + "rewards/rejected": -1.615234375, + "step": 328 + }, + { + "epoch": 0.0652810159234089, + "grad_norm": 46.6979940738689, + "learning_rate": 6.507936507936507e-07, + "logits/chosen": 3.8046875, + "logits/rejected": 3.94140625, + "logps/chosen": -747.0, + "logps/rejected": -460.5, + "loss": 0.659, + "rewards/accuracies": 0.78125, + 
"rewards/chosen": 0.5654296875, + "rewards/margins": 1.953125, + "rewards/rejected": -1.38671875, + "step": 329 + }, + { + "epoch": 0.06547943846420953, + "grad_norm": 45.45554931043243, + "learning_rate": 6.527777777777777e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 4.41015625, + "logps/chosen": -734.0, + "logps/rejected": -699.0, + "loss": 0.6521, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.60009765625, + "rewards/margins": 5.6953125, + "rewards/rejected": -5.095703125, + "step": 330 + }, + { + "epoch": 0.06567786100501016, + "grad_norm": 54.00355114359477, + "learning_rate": 6.547619047619047e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.26171875, + "logps/chosen": -945.0, + "logps/rejected": -749.5, + "loss": 0.6465, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.0869140625, + "rewards/margins": 2.29296875, + "rewards/rejected": -1.202484130859375, + "step": 331 + }, + { + "epoch": 0.06587628354581081, + "grad_norm": 56.56474994665108, + "learning_rate": 6.567460317460317e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.65625, + "logps/chosen": -1056.0, + "logps/rejected": -1037.5, + "loss": 0.605, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.82421875, + "rewards/margins": 3.05859375, + "rewards/rejected": -2.23828125, + "step": 332 + }, + { + "epoch": 0.06607470608661144, + "grad_norm": 53.62825129772964, + "learning_rate": 6.587301587301587e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.12890625, + "logps/chosen": -1197.0, + "logps/rejected": -820.0, + "loss": 0.673, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.92822265625, + "rewards/margins": 2.380859375, + "rewards/rejected": -1.4580078125, + "step": 333 + }, + { + "epoch": 0.06627312862741207, + "grad_norm": 47.31956699094805, + "learning_rate": 6.607142857142857e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 4.0546875, + "logps/chosen": -739.0, + "logps/rejected": -594.5, + "loss": 0.6416, + "rewards/accuracies": 
0.9375, + "rewards/chosen": 1.1552734375, + "rewards/margins": 2.1875, + "rewards/rejected": -1.03125, + "step": 334 + }, + { + "epoch": 0.06647155116821271, + "grad_norm": 50.87051222830231, + "learning_rate": 6.626984126984126e-07, + "logits/chosen": 3.96875, + "logits/rejected": 4.26171875, + "logps/chosen": -1326.0, + "logps/rejected": -893.0, + "loss": 0.4884, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.0322265625, + "rewards/margins": 3.7578125, + "rewards/rejected": -2.73046875, + "step": 335 + }, + { + "epoch": 0.06666997370901334, + "grad_norm": 55.87042040393185, + "learning_rate": 6.646825396825396e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.578125, + "logps/chosen": -1242.0, + "logps/rejected": -1112.0, + "loss": 0.5683, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.1650390625, + "rewards/margins": 4.34765625, + "rewards/rejected": -3.18359375, + "step": 336 + }, + { + "epoch": 0.06686839624981399, + "grad_norm": 53.73815826749983, + "learning_rate": 6.666666666666666e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.3046875, + "logps/chosen": -1141.0, + "logps/rejected": -757.0, + "loss": 0.5996, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.9326171875, + "rewards/margins": 2.654296875, + "rewards/rejected": -1.7236328125, + "step": 337 + }, + { + "epoch": 0.06706681879061462, + "grad_norm": 53.254541943938875, + "learning_rate": 6.686507936507936e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.24609375, + "logps/chosen": -1205.0, + "logps/rejected": -689.0, + "loss": 0.5882, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.0146484375, + "rewards/margins": 2.6328125, + "rewards/rejected": -1.6220703125, + "step": 338 + }, + { + "epoch": 0.06726524133141525, + "grad_norm": 53.57249350141192, + "learning_rate": 6.706349206349206e-07, + "logits/chosen": 3.796875, + "logits/rejected": 3.97265625, + "logps/chosen": -1241.0, + "logps/rejected": -721.5, + "loss": 0.5491, + "rewards/accuracies": 
0.875, + "rewards/chosen": 1.1611328125, + "rewards/margins": 3.259765625, + "rewards/rejected": -2.095703125, + "step": 339 + }, + { + "epoch": 0.06746366387221589, + "grad_norm": 56.35318376566995, + "learning_rate": 6.726190476190477e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.33984375, + "logps/chosen": -1227.0, + "logps/rejected": -619.0, + "loss": 0.5196, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.1376953125, + "rewards/margins": 2.8671875, + "rewards/rejected": -1.728515625, + "step": 340 + }, + { + "epoch": 0.06766208641301652, + "grad_norm": 57.27090295434903, + "learning_rate": 6.746031746031746e-07, + "logits/chosen": 4.5625, + "logits/rejected": 4.609375, + "logps/chosen": -829.0, + "logps/rejected": -1189.5, + "loss": 0.5945, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.7705078125, + "rewards/margins": 2.89453125, + "rewards/rejected": -2.119140625, + "step": 341 + }, + { + "epoch": 0.06786050895381715, + "grad_norm": 44.02321304225732, + "learning_rate": 6.765873015873016e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.453125, + "logps/chosen": -983.5, + "logps/rejected": -799.0, + "loss": 0.6698, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.546875, + "rewards/margins": 2.1044921875, + "rewards/rejected": -1.556640625, + "step": 342 + }, + { + "epoch": 0.0680589314946178, + "grad_norm": 53.454348655478306, + "learning_rate": 6.785714285714286e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.2578125, + "logps/chosen": -819.0, + "logps/rejected": -688.0, + "loss": 0.6798, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.395263671875, + "rewards/margins": 2.287109375, + "rewards/rejected": -1.888671875, + "step": 343 + }, + { + "epoch": 0.06825735403541842, + "grad_norm": 57.526116643277206, + "learning_rate": 6.805555555555556e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.32421875, + "logps/chosen": -901.5, + "logps/rejected": -717.5, + "loss": 0.6781, + 
"rewards/accuracies": 0.84375, + "rewards/chosen": 0.67822265625, + "rewards/margins": 2.1640625, + "rewards/rejected": -1.482421875, + "step": 344 + }, + { + "epoch": 0.06845577657621905, + "grad_norm": 51.363443255807375, + "learning_rate": 6.825396825396826e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.4453125, + "logps/chosen": -843.0, + "logps/rejected": -655.5, + "loss": 0.6315, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.955352783203125, + "rewards/margins": 2.4140625, + "rewards/rejected": -1.45703125, + "step": 345 + }, + { + "epoch": 0.0686541991170197, + "grad_norm": 50.145920667201004, + "learning_rate": 6.845238095238095e-07, + "logits/chosen": 3.671875, + "logits/rejected": 4.23828125, + "logps/chosen": -1134.0, + "logps/rejected": -715.5, + "loss": 0.5597, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.1123046875, + "rewards/margins": 3.11328125, + "rewards/rejected": -2.001953125, + "step": 346 + }, + { + "epoch": 0.06885262165782033, + "grad_norm": 73.49030573456704, + "learning_rate": 6.865079365079364e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.29296875, + "logps/chosen": -1234.5, + "logps/rejected": -871.5, + "loss": 0.512, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.658203125, + "rewards/margins": 3.69140625, + "rewards/rejected": -2.03125, + "step": 347 + }, + { + "epoch": 0.06905104419862096, + "grad_norm": 56.574543775314766, + "learning_rate": 6.884920634920634e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.40625, + "logps/chosen": -1180.0, + "logps/rejected": -1560.0, + "loss": 0.6428, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.06201171875, + "rewards/margins": 3.259765625, + "rewards/rejected": -2.1953125, + "step": 348 + }, + { + "epoch": 0.0692494667394216, + "grad_norm": 47.43853269782176, + "learning_rate": 6.904761904761904e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.5546875, + "logps/chosen": -768.0, + "logps/rejected": -440.5, + "loss": 0.5994, 
+ "rewards/accuracies": 0.90625, + "rewards/chosen": 0.87255859375, + "rewards/margins": 2.5234375, + "rewards/rejected": -1.65234375, + "step": 349 + }, + { + "epoch": 0.06944788928022223, + "grad_norm": 50.02031774664181, + "learning_rate": 6.924603174603174e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.5546875, + "logps/chosen": -707.0, + "logps/rejected": -626.0, + "loss": 0.7085, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.5511474609375, + "rewards/margins": 2.087890625, + "rewards/rejected": -1.5390625, + "step": 350 + }, + { + "epoch": 0.06964631182102286, + "grad_norm": 51.406269205821886, + "learning_rate": 6.944444444444444e-07, + "logits/chosen": 4.4453125, + "logits/rejected": 4.203125, + "logps/chosen": -1098.0, + "logps/rejected": -542.5, + "loss": 0.5945, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.26953125, + "rewards/margins": 2.568359375, + "rewards/rejected": -1.2998046875, + "step": 351 + }, + { + "epoch": 0.06984473436182351, + "grad_norm": 48.1779336676668, + "learning_rate": 6.964285714285714e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 3.99609375, + "logps/chosen": -1261.0, + "logps/rejected": -748.0, + "loss": 0.5491, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.2509765625, + "rewards/margins": 3.16796875, + "rewards/rejected": -1.916015625, + "step": 352 + }, + { + "epoch": 0.07004315690262414, + "grad_norm": 53.258773506087685, + "learning_rate": 6.984126984126983e-07, + "logits/chosen": 4.39453125, + "logits/rejected": 4.15234375, + "logps/chosen": -1277.0, + "logps/rejected": -733.0, + "loss": 0.504, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.21484375, + "rewards/margins": 3.0546875, + "rewards/rejected": -1.8427734375, + "step": 353 + }, + { + "epoch": 0.07024157944342477, + "grad_norm": 70.01142683752839, + "learning_rate": 7.003968253968253e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.453125, + "logps/chosen": -1263.0, + "logps/rejected": -714.0, + "loss": 
0.6295, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.505615234375, + "rewards/margins": 2.453125, + "rewards/rejected": -1.94921875, + "step": 354 + }, + { + "epoch": 0.07044000198422541, + "grad_norm": 49.81054636803302, + "learning_rate": 7.023809523809523e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.4921875, + "logps/chosen": -695.0, + "logps/rejected": -1311.5, + "loss": 0.579, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8570556640625, + "rewards/margins": 3.76953125, + "rewards/rejected": -2.9150390625, + "step": 355 + }, + { + "epoch": 0.07063842452502604, + "grad_norm": 59.1091956199372, + "learning_rate": 7.043650793650794e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.12890625, + "logps/chosen": -1306.0, + "logps/rejected": -638.0, + "loss": 0.5435, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.01953125, + "rewards/margins": 3.255859375, + "rewards/rejected": -2.236328125, + "step": 356 + }, + { + "epoch": 0.07083684706582667, + "grad_norm": 55.176752485723064, + "learning_rate": 7.063492063492064e-07, + "logits/chosen": 3.4453125, + "logits/rejected": 3.8984375, + "logps/chosen": -656.0, + "logps/rejected": -1882.0, + "loss": 0.6689, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.72119140625, + "rewards/margins": 4.173828125, + "rewards/rejected": -3.4443359375, + "step": 357 + }, + { + "epoch": 0.07103526960662732, + "grad_norm": 57.88303324492041, + "learning_rate": 7.083333333333334e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.1796875, + "logps/chosen": -1222.0, + "logps/rejected": -869.0, + "loss": 0.6054, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.8134765625, + "rewards/margins": 3.02734375, + "rewards/rejected": -2.21875, + "step": 358 + }, + { + "epoch": 0.07123369214742795, + "grad_norm": 50.582002987543625, + "learning_rate": 7.103174603174603e-07, + "logits/chosen": 4.33984375, + "logits/rejected": 4.44140625, + "logps/chosen": -1152.0, + "logps/rejected": -827.0, 
+ "loss": 0.5957, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.00830078125, + "rewards/margins": 3.451171875, + "rewards/rejected": -2.443359375, + "step": 359 + }, + { + "epoch": 0.07143211468822858, + "grad_norm": 46.442764641935334, + "learning_rate": 7.123015873015873e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.75, + "logps/chosen": -946.0, + "logps/rejected": -799.0, + "loss": 0.5745, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.2265625, + "rewards/margins": 2.708984375, + "rewards/rejected": -1.48046875, + "step": 360 + }, + { + "epoch": 0.07163053722902922, + "grad_norm": 45.45421800592703, + "learning_rate": 7.142857142857143e-07, + "logits/chosen": 4.734375, + "logits/rejected": 4.8125, + "logps/chosen": -1050.0, + "logps/rejected": -676.0, + "loss": 0.6107, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.37109375, + "rewards/margins": 2.509765625, + "rewards/rejected": -1.139404296875, + "step": 361 + }, + { + "epoch": 0.07182895976982985, + "grad_norm": 51.26385005685058, + "learning_rate": 7.162698412698413e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 3.890625, + "logps/chosen": -1217.0, + "logps/rejected": -729.0, + "loss": 0.5431, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.8310546875, + "rewards/margins": 3.51171875, + "rewards/rejected": -2.68359375, + "step": 362 + }, + { + "epoch": 0.0720273823106305, + "grad_norm": 46.909808836461615, + "learning_rate": 7.182539682539682e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 4.27734375, + "logps/chosen": -1222.0, + "logps/rejected": -1405.0, + "loss": 0.5282, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8349609375, + "rewards/margins": 3.828125, + "rewards/rejected": -2.9921875, + "step": 363 + }, + { + "epoch": 0.07222580485143112, + "grad_norm": 43.875111372719445, + "learning_rate": 7.202380952380952e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.3046875, + "logps/chosen": -789.0, + "logps/rejected": -559.0, + "loss": 
0.5429, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.84814453125, + "rewards/margins": 2.90625, + "rewards/rejected": -2.060546875, + "step": 364 + }, + { + "epoch": 0.07242422739223175, + "grad_norm": 51.10071662975572, + "learning_rate": 7.222222222222221e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.22265625, + "logps/chosen": -1063.0, + "logps/rejected": -649.0, + "loss": 0.5645, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8916015625, + "rewards/margins": 3.36328125, + "rewards/rejected": -2.474609375, + "step": 365 + }, + { + "epoch": 0.0726226499330324, + "grad_norm": 52.69435112094249, + "learning_rate": 7.242063492063491e-07, + "logits/chosen": 3.953125, + "logits/rejected": 4.26171875, + "logps/chosen": -928.5, + "logps/rejected": -608.0, + "loss": 0.6312, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.9306640625, + "rewards/margins": 2.595703125, + "rewards/rejected": -1.6650390625, + "step": 366 + }, + { + "epoch": 0.07282107247383303, + "grad_norm": 60.69788207328555, + "learning_rate": 7.261904761904761e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.17578125, + "logps/chosen": -1112.0, + "logps/rejected": -808.0, + "loss": 0.5937, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.6640625, + "rewards/margins": 3.171875, + "rewards/rejected": -2.5078125, + "step": 367 + }, + { + "epoch": 0.07301949501463366, + "grad_norm": 45.89227623559167, + "learning_rate": 7.281746031746031e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.5078125, + "logps/chosen": -1054.0, + "logps/rejected": -689.0, + "loss": 0.7775, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.435546875, + "rewards/margins": 1.548828125, + "rewards/rejected": -1.11181640625, + "step": 368 + }, + { + "epoch": 0.0732179175554343, + "grad_norm": 51.92687583591863, + "learning_rate": 7.301587301587301e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.296875, + "logps/chosen": -811.0, + "logps/rejected": -565.0, + "loss": 
0.6876, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.3724365234375, + "rewards/margins": 1.93408203125, + "rewards/rejected": -1.5615234375, + "step": 369 + }, + { + "epoch": 0.07341634009623493, + "grad_norm": 48.82359277613043, + "learning_rate": 7.321428571428571e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.328125, + "logps/chosen": -780.0, + "logps/rejected": -423.0, + "loss": 0.6658, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.001953125, + "rewards/margins": 2.080078125, + "rewards/rejected": -1.0791015625, + "step": 370 + }, + { + "epoch": 0.07361476263703556, + "grad_norm": 41.05498240382638, + "learning_rate": 7.34126984126984e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.33984375, + "logps/chosen": -999.0, + "logps/rejected": -1373.75, + "loss": 0.5225, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.1962890625, + "rewards/margins": 3.97265625, + "rewards/rejected": -2.771484375, + "step": 371 + }, + { + "epoch": 0.07381318517783621, + "grad_norm": 53.73513792392258, + "learning_rate": 7.361111111111111e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 4.01171875, + "logps/chosen": -928.0, + "logps/rejected": -686.0, + "loss": 0.5272, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.38671875, + "rewards/margins": 8.36328125, + "rewards/rejected": -6.9921875, + "step": 372 + }, + { + "epoch": 0.07401160771863684, + "grad_norm": 47.250722915287035, + "learning_rate": 7.380952380952381e-07, + "logits/chosen": 4.7578125, + "logits/rejected": 4.6171875, + "logps/chosen": -612.0, + "logps/rejected": -671.0, + "loss": 0.6856, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.1669921875, + "rewards/margins": 2.1298828125, + "rewards/rejected": -0.96533203125, + "step": 373 + }, + { + "epoch": 0.07421003025943747, + "grad_norm": 43.213309811831124, + "learning_rate": 7.400793650793651e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 4.36328125, + "logps/chosen": -659.0, + "logps/rejected": 
-554.5, + "loss": 0.6344, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.736572265625, + "rewards/margins": 2.23046875, + "rewards/rejected": -1.49267578125, + "step": 374 + }, + { + "epoch": 0.07440845280023811, + "grad_norm": 49.04502635606486, + "learning_rate": 7.420634920634921e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 4.34375, + "logps/chosen": -839.0, + "logps/rejected": -533.5, + "loss": 0.7144, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.935546875, + "rewards/margins": 1.8056640625, + "rewards/rejected": -0.86962890625, + "step": 375 + }, + { + "epoch": 0.07460687534103874, + "grad_norm": 44.79759846722711, + "learning_rate": 7.440476190476191e-07, + "logits/chosen": 4.43359375, + "logits/rejected": 4.8046875, + "logps/chosen": -785.5, + "logps/rejected": -632.0, + "loss": 0.6202, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0625, + "rewards/margins": 2.8671875, + "rewards/rejected": -1.806640625, + "step": 376 + }, + { + "epoch": 0.07480529788183937, + "grad_norm": 54.65946918873532, + "learning_rate": 7.46031746031746e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 4.234375, + "logps/chosen": -1004.0, + "logps/rejected": -1300.0, + "loss": 0.5655, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.0419921875, + "rewards/margins": 3.484375, + "rewards/rejected": -2.443359375, + "step": 377 + }, + { + "epoch": 0.07500372042264002, + "grad_norm": 61.10027416184383, + "learning_rate": 7.48015873015873e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 4.13671875, + "logps/chosen": -1110.0, + "logps/rejected": -805.0, + "loss": 0.6498, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.80078125, + "rewards/margins": 2.337890625, + "rewards/rejected": -1.5361328125, + "step": 378 + }, + { + "epoch": 0.07520214296344065, + "grad_norm": 48.2121035485959, + "learning_rate": 7.5e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.5234375, + "logps/chosen": -939.5, + "logps/rejected": -664.0, + 
"loss": 0.5707, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6787109375, + "rewards/margins": 2.923828125, + "rewards/rejected": -2.2421875, + "step": 379 + }, + { + "epoch": 0.07540056550424128, + "grad_norm": 42.043624711679406, + "learning_rate": 7.519841269841269e-07, + "logits/chosen": 4.6875, + "logits/rejected": 4.7890625, + "logps/chosen": -1491.0, + "logps/rejected": -805.0, + "loss": 0.4946, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.265625, + "rewards/margins": 4.6484375, + "rewards/rejected": -3.38671875, + "step": 380 + }, + { + "epoch": 0.07559898804504192, + "grad_norm": 52.80946554890782, + "learning_rate": 7.539682539682539e-07, + "logits/chosen": 4.51171875, + "logits/rejected": 4.49609375, + "logps/chosen": -1137.0, + "logps/rejected": -681.0, + "loss": 0.5671, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.0693359375, + "rewards/margins": 3.30078125, + "rewards/rejected": -2.232421875, + "step": 381 + }, + { + "epoch": 0.07579741058584255, + "grad_norm": 65.05580400517938, + "learning_rate": 7.559523809523809e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.5390625, + "logps/chosen": -1329.0, + "logps/rejected": -785.0, + "loss": 0.6095, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.896484375, + "rewards/margins": 2.9140625, + "rewards/rejected": -2.015625, + "step": 382 + }, + { + "epoch": 0.07599583312664318, + "grad_norm": 63.13561550507374, + "learning_rate": 7.579365079365078e-07, + "logits/chosen": 3.828125, + "logits/rejected": 4.0625, + "logps/chosen": -1091.0, + "logps/rejected": -923.0, + "loss": 0.6353, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.533203125, + "rewards/margins": 2.85546875, + "rewards/rejected": -2.330078125, + "step": 383 + }, + { + "epoch": 0.07619425566744382, + "grad_norm": 48.151440952985524, + "learning_rate": 7.599206349206348e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 4.36328125, + "logps/chosen": -914.0, + "logps/rejected": -602.0, + "loss": 
0.5052, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.078125, + "rewards/margins": 3.4140625, + "rewards/rejected": -2.337890625, + "step": 384 + }, + { + "epoch": 0.07639267820824445, + "grad_norm": 49.65428828293292, + "learning_rate": 7.619047619047618e-07, + "logits/chosen": 3.78125, + "logits/rejected": 4.1015625, + "logps/chosen": -893.0, + "logps/rejected": -558.0, + "loss": 0.588, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.95361328125, + "rewards/margins": 2.68359375, + "rewards/rejected": -1.7265625, + "step": 385 + }, + { + "epoch": 0.07659110074904509, + "grad_norm": 45.68068601403133, + "learning_rate": 7.638888888888888e-07, + "logits/chosen": 4.015625, + "logits/rejected": 4.34375, + "logps/chosen": -884.0, + "logps/rejected": -638.0, + "loss": 0.69, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.91162109375, + "rewards/margins": 2.39453125, + "rewards/rejected": -1.48828125, + "step": 386 + }, + { + "epoch": 0.07678952328984573, + "grad_norm": 52.386053569256994, + "learning_rate": 7.658730158730159e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 4.34765625, + "logps/chosen": -1282.0, + "logps/rejected": -849.0, + "loss": 0.539, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.81201171875, + "rewards/margins": 6.9296875, + "rewards/rejected": -6.1171875, + "step": 387 + }, + { + "epoch": 0.07698794583064636, + "grad_norm": 43.80347341754957, + "learning_rate": 7.678571428571429e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.23046875, + "logps/chosen": -873.5, + "logps/rejected": -670.5, + "loss": 0.5645, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.80078125, + "rewards/margins": 2.796875, + "rewards/rejected": -1.994140625, + "step": 388 + }, + { + "epoch": 0.07718636837144699, + "grad_norm": 49.036187502789076, + "learning_rate": 7.698412698412699e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.40625, + "logps/chosen": -814.0, + "logps/rejected": -649.0, + "loss": 0.5575, + 
"rewards/accuracies": 0.90625, + "rewards/chosen": 1.0439453125, + "rewards/margins": 2.69140625, + "rewards/rejected": -1.65234375, + "step": 389 + }, + { + "epoch": 0.07738479091224763, + "grad_norm": 73.57228116370335, + "learning_rate": 7.718253968253968e-07, + "logits/chosen": 3.984375, + "logits/rejected": 4.31640625, + "logps/chosen": -1052.0, + "logps/rejected": -787.0, + "loss": 0.5115, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.5458984375, + "rewards/margins": 3.41796875, + "rewards/rejected": -1.8798828125, + "step": 390 + }, + { + "epoch": 0.07758321345304826, + "grad_norm": 44.84451341774657, + "learning_rate": 7.738095238095238e-07, + "logits/chosen": 4.59765625, + "logits/rejected": 4.7734375, + "logps/chosen": -926.0, + "logps/rejected": -1497.0, + "loss": 0.5099, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.08740234375, + "rewards/margins": 4.015625, + "rewards/rejected": -2.93359375, + "step": 391 + }, + { + "epoch": 0.07778163599384891, + "grad_norm": 47.39396989975399, + "learning_rate": 7.757936507936508e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 4.44921875, + "logps/chosen": -1044.0, + "logps/rejected": -816.0, + "loss": 0.4936, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.048828125, + "rewards/margins": 7.11328125, + "rewards/rejected": -6.06640625, + "step": 392 + }, + { + "epoch": 0.07798005853464954, + "grad_norm": 56.420021209659126, + "learning_rate": 7.777777777777778e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.359375, + "logps/chosen": -1416.0, + "logps/rejected": -824.0, + "loss": 0.4713, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.505859375, + "rewards/margins": 3.7578125, + "rewards/rejected": -2.25390625, + "step": 393 + }, + { + "epoch": 0.07817848107545017, + "grad_norm": 43.98111321050767, + "learning_rate": 7.797619047619048e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.08984375, + "logps/chosen": -915.0, + "logps/rejected": -657.5, + "loss": 
0.5397, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9349365234375, + "rewards/margins": 3.310546875, + "rewards/rejected": -2.376953125, + "step": 394 + }, + { + "epoch": 0.07837690361625081, + "grad_norm": 49.752110303651534, + "learning_rate": 7.817460317460318e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.2578125, + "logps/chosen": -1011.0, + "logps/rejected": -882.5, + "loss": 0.5881, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.845703125, + "rewards/margins": 3.189453125, + "rewards/rejected": -2.345703125, + "step": 395 + }, + { + "epoch": 0.07857532615705144, + "grad_norm": 41.17902111557806, + "learning_rate": 7.837301587301586e-07, + "logits/chosen": 4.671875, + "logits/rejected": 4.7109375, + "logps/chosen": -1028.0, + "logps/rejected": -591.5, + "loss": 0.4765, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.240234375, + "rewards/margins": 3.64453125, + "rewards/rejected": -2.396484375, + "step": 396 + }, + { + "epoch": 0.07877374869785207, + "grad_norm": 45.25064532189423, + "learning_rate": 7.857142857142856e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 4.23046875, + "logps/chosen": -949.0, + "logps/rejected": -574.0, + "loss": 0.5042, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.30078125, + "rewards/margins": 3.7421875, + "rewards/rejected": -2.447265625, + "step": 397 + }, + { + "epoch": 0.07897217123865272, + "grad_norm": 46.40674330735161, + "learning_rate": 7.876984126984126e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.59375, + "logps/chosen": -854.0, + "logps/rejected": -1436.5, + "loss": 0.5469, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.80712890625, + "rewards/margins": 4.62109375, + "rewards/rejected": -3.80859375, + "step": 398 + }, + { + "epoch": 0.07917059377945335, + "grad_norm": 49.03083506753916, + "learning_rate": 7.896825396825396e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.2890625, + "logps/chosen": -848.0, + "logps/rejected": -429.5, + 
"loss": 0.5597, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8876953125, + "rewards/margins": 2.76953125, + "rewards/rejected": -1.884765625, + "step": 399 + }, + { + "epoch": 0.07936901632025398, + "grad_norm": 46.69940114230435, + "learning_rate": 7.916666666666666e-07, + "logits/chosen": 4.56640625, + "logits/rejected": 4.703125, + "logps/chosen": -1100.0, + "logps/rejected": -572.5, + "loss": 0.6684, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.80810546875, + "rewards/margins": 2.423828125, + "rewards/rejected": -1.619140625, + "step": 400 + }, + { + "epoch": 0.07956743886105462, + "grad_norm": 45.968111761275345, + "learning_rate": 7.936507936507936e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 4.1875, + "logps/chosen": -1145.0, + "logps/rejected": -1449.0, + "loss": 0.6532, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.4765625, + "rewards/margins": 2.87841796875, + "rewards/rejected": -2.3955078125, + "step": 401 + }, + { + "epoch": 0.07976586140185525, + "grad_norm": 49.3130545976194, + "learning_rate": 7.956349206349205e-07, + "logits/chosen": 4.078125, + "logits/rejected": 4.296875, + "logps/chosen": -1156.5, + "logps/rejected": -699.5, + "loss": 0.5546, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.0234375, + "rewards/margins": 3.296875, + "rewards/rejected": -2.275390625, + "step": 402 + }, + { + "epoch": 0.07996428394265588, + "grad_norm": 102.57881498853406, + "learning_rate": 7.976190476190476e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.55078125, + "logps/chosen": -1282.0, + "logps/rejected": -860.5, + "loss": 0.488, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.220703125, + "rewards/margins": 4.265625, + "rewards/rejected": -3.046875, + "step": 403 + }, + { + "epoch": 0.08016270648345653, + "grad_norm": 44.965453130395325, + "learning_rate": 7.996031746031746e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.05859375, + "logps/chosen": -1227.0, + "logps/rejected": -778.5, + 
"loss": 0.4818, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.64453125, + "rewards/margins": 3.8359375, + "rewards/rejected": -2.19921875, + "step": 404 + }, + { + "epoch": 0.08036112902425716, + "grad_norm": 47.04454119093383, + "learning_rate": 8.015873015873016e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 3.9453125, + "logps/chosen": -931.0, + "logps/rejected": -483.0, + "loss": 0.5768, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.21484375, + "rewards/margins": 2.693359375, + "rewards/rejected": -1.4794921875, + "step": 405 + }, + { + "epoch": 0.08055955156505779, + "grad_norm": 43.61450017623783, + "learning_rate": 8.035714285714286e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.40234375, + "logps/chosen": -1069.0, + "logps/rejected": -678.0, + "loss": 0.4992, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.3984375, + "rewards/margins": 3.72265625, + "rewards/rejected": -2.31640625, + "step": 406 + }, + { + "epoch": 0.08075797410585843, + "grad_norm": 46.39412081304887, + "learning_rate": 8.055555555555556e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.05859375, + "logps/chosen": -828.0, + "logps/rejected": -644.0, + "loss": 0.7253, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.623046875, + "rewards/margins": 1.947265625, + "rewards/rejected": -1.32421875, + "step": 407 + }, + { + "epoch": 0.08095639664665906, + "grad_norm": 48.95984964699793, + "learning_rate": 8.075396825396825e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.3828125, + "logps/chosen": -1343.0, + "logps/rejected": -1077.0, + "loss": 0.4689, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.03515625, + "rewards/margins": 4.859375, + "rewards/rejected": -3.828125, + "step": 408 + }, + { + "epoch": 0.08115481918745969, + "grad_norm": 46.698925454769146, + "learning_rate": 8.095238095238095e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.11328125, + "logps/chosen": -1113.0, + "logps/rejected": -681.5, + "loss": 
0.487, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.24609375, + "rewards/margins": 3.81640625, + "rewards/rejected": -2.576171875, + "step": 409 + }, + { + "epoch": 0.08135324172826033, + "grad_norm": 37.45344384293144, + "learning_rate": 8.115079365079365e-07, + "logits/chosen": 3.74609375, + "logits/rejected": 3.94140625, + "logps/chosen": -881.0, + "logps/rejected": -624.0, + "loss": 0.5084, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.228515625, + "rewards/margins": 3.46484375, + "rewards/rejected": -2.2373046875, + "step": 410 + }, + { + "epoch": 0.08155166426906096, + "grad_norm": 57.58876369910785, + "learning_rate": 8.134920634920635e-07, + "logits/chosen": 3.16015625, + "logits/rejected": 3.39453125, + "logps/chosen": -1010.0, + "logps/rejected": -815.5, + "loss": 0.643, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.889892578125, + "rewards/margins": 2.564453125, + "rewards/rejected": -1.67578125, + "step": 411 + }, + { + "epoch": 0.0817500868098616, + "grad_norm": 48.90746759401974, + "learning_rate": 8.154761904761905e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.34765625, + "logps/chosen": -799.0, + "logps/rejected": -608.5, + "loss": 0.5691, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.756103515625, + "rewards/margins": 2.796875, + "rewards/rejected": -2.04296875, + "step": 412 + }, + { + "epoch": 0.08194850935066224, + "grad_norm": 42.90466254850268, + "learning_rate": 8.174603174603174e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.35546875, + "logps/chosen": -928.0, + "logps/rejected": -714.0, + "loss": 0.5276, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.3857421875, + "rewards/margins": 3.296875, + "rewards/rejected": -1.908203125, + "step": 413 + }, + { + "epoch": 0.08214693189146287, + "grad_norm": 53.751723998993114, + "learning_rate": 8.194444444444443e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 4.3125, + "logps/chosen": -1009.0, + "logps/rejected": -791.0, + 
"loss": 0.5616, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.4775390625, + "rewards/margins": 3.37109375, + "rewards/rejected": -1.892578125, + "step": 414 + }, + { + "epoch": 0.0823453544322635, + "grad_norm": 46.373942872597304, + "learning_rate": 8.214285714285713e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 4.421875, + "logps/chosen": -1242.0, + "logps/rejected": -869.0, + "loss": 0.5052, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.3154296875, + "rewards/margins": 3.703125, + "rewards/rejected": -2.390625, + "step": 415 + }, + { + "epoch": 0.08254377697306414, + "grad_norm": 49.471368732271344, + "learning_rate": 8.234126984126983e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.15625, + "logps/chosen": -857.5, + "logps/rejected": -443.25, + "loss": 0.7575, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.830078125, + "rewards/margins": 1.5517578125, + "rewards/rejected": -0.720703125, + "step": 416 + }, + { + "epoch": 0.08274219951386477, + "grad_norm": 38.589875710849526, + "learning_rate": 8.253968253968253e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 4.390625, + "logps/chosen": -905.5, + "logps/rejected": -805.0, + "loss": 0.5547, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.416015625, + "rewards/margins": 3.47265625, + "rewards/rejected": -2.05859375, + "step": 417 + }, + { + "epoch": 0.08294062205466542, + "grad_norm": 49.068519615407574, + "learning_rate": 8.273809523809523e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.44140625, + "logps/chosen": -932.0, + "logps/rejected": -735.0, + "loss": 0.549, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1494140625, + "rewards/margins": 3.21484375, + "rewards/rejected": -2.0703125, + "step": 418 + }, + { + "epoch": 0.08313904459546605, + "grad_norm": 49.55789938543452, + "learning_rate": 8.293650793650794e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.796875, + "logps/chosen": -1168.0, + "logps/rejected": -1586.0, + 
"loss": 0.6377, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.8935546875, + "rewards/margins": 3.48046875, + "rewards/rejected": -2.59228515625, + "step": 419 + }, + { + "epoch": 0.08333746713626668, + "grad_norm": 38.436063094377126, + "learning_rate": 8.313492063492063e-07, + "logits/chosen": 4.29296875, + "logits/rejected": 4.4921875, + "logps/chosen": -819.0, + "logps/rejected": -740.0, + "loss": 0.5437, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.36328125, + "rewards/margins": 3.92578125, + "rewards/rejected": -2.5625, + "step": 420 + }, + { + "epoch": 0.08353588967706732, + "grad_norm": 50.996972936460764, + "learning_rate": 8.333333333333333e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.453125, + "logps/chosen": -1184.0, + "logps/rejected": -956.0, + "loss": 0.5099, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.1669921875, + "rewards/margins": 3.765625, + "rewards/rejected": -2.6015625, + "step": 421 + }, + { + "epoch": 0.08373431221786795, + "grad_norm": 42.74059187221395, + "learning_rate": 8.353174603174603e-07, + "logits/chosen": 3.578125, + "logits/rejected": 3.9140625, + "logps/chosen": -961.0, + "logps/rejected": -2062.0, + "loss": 0.5353, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.0791015625, + "rewards/margins": 5.015625, + "rewards/rejected": -3.9404296875, + "step": 422 + }, + { + "epoch": 0.08393273475866858, + "grad_norm": 60.675966568269665, + "learning_rate": 8.373015873015873e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.41796875, + "logps/chosen": -901.0, + "logps/rejected": -1065.0, + "loss": 0.621, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0654296875, + "rewards/margins": 2.65625, + "rewards/rejected": -1.5908203125, + "step": 423 + }, + { + "epoch": 0.08413115729946923, + "grad_norm": 49.49384961213828, + "learning_rate": 8.392857142857143e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 4.23828125, + "logps/chosen": -1192.0, + "logps/rejected": -616.0, + 
"loss": 0.5079, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.341796875, + "rewards/margins": 3.90234375, + "rewards/rejected": -2.564453125, + "step": 424 + }, + { + "epoch": 0.08432957984026986, + "grad_norm": 50.8947309972659, + "learning_rate": 8.412698412698413e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.42578125, + "logps/chosen": -828.0, + "logps/rejected": -1075.5, + "loss": 0.5184, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1396484375, + "rewards/margins": 4.2421875, + "rewards/rejected": -3.091796875, + "step": 425 + }, + { + "epoch": 0.08452800238107049, + "grad_norm": 46.626779475947316, + "learning_rate": 8.432539682539682e-07, + "logits/chosen": 4.046875, + "logits/rejected": 4.0859375, + "logps/chosen": -1114.0, + "logps/rejected": -796.5, + "loss": 0.5332, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4130859375, + "rewards/margins": 3.828125, + "rewards/rejected": -2.4140625, + "step": 426 + }, + { + "epoch": 0.08472642492187113, + "grad_norm": 51.96830976922309, + "learning_rate": 8.452380952380952e-07, + "logits/chosen": 4.46875, + "logits/rejected": 4.703125, + "logps/chosen": -770.0, + "logps/rejected": -1063.5, + "loss": 0.6013, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.9765625, + "rewards/margins": 3.078125, + "rewards/rejected": -2.09375, + "step": 427 + }, + { + "epoch": 0.08492484746267176, + "grad_norm": 53.99463108984334, + "learning_rate": 8.472222222222222e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.03125, + "logps/chosen": -1305.0, + "logps/rejected": -713.0, + "loss": 0.5747, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.095703125, + "rewards/margins": 2.87109375, + "rewards/rejected": -1.775390625, + "step": 428 + }, + { + "epoch": 0.08512327000347239, + "grad_norm": 49.226791744388386, + "learning_rate": 8.492063492063492e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.33203125, + "logps/chosen": -1203.0, + "logps/rejected": -688.0, + "loss": 0.455, + 
"rewards/accuracies": 0.90625, + "rewards/chosen": 1.7294921875, + "rewards/margins": 3.9375, + "rewards/rejected": -2.201171875, + "step": 429 + }, + { + "epoch": 0.08532169254427303, + "grad_norm": 43.81616590697778, + "learning_rate": 8.511904761904761e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 3.88671875, + "logps/chosen": -998.0, + "logps/rejected": -781.0, + "loss": 0.5375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.205078125, + "rewards/margins": 3.376953125, + "rewards/rejected": -2.171875, + "step": 430 + }, + { + "epoch": 0.08552011508507366, + "grad_norm": 52.74442016805919, + "learning_rate": 8.531746031746031e-07, + "logits/chosen": 4.39453125, + "logits/rejected": 4.6484375, + "logps/chosen": -1257.0, + "logps/rejected": -715.5, + "loss": 0.4626, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1875, + "rewards/margins": 4.27734375, + "rewards/rejected": -3.08984375, + "step": 431 + }, + { + "epoch": 0.0857185376258743, + "grad_norm": 103.27291076253678, + "learning_rate": 8.5515873015873e-07, + "logits/chosen": 4.5390625, + "logits/rejected": 4.609375, + "logps/chosen": -924.0, + "logps/rejected": -789.0, + "loss": 0.5839, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.05810546875, + "rewards/margins": 2.9765625, + "rewards/rejected": -1.9140625, + "step": 432 + }, + { + "epoch": 0.08591696016667494, + "grad_norm": 48.53537827663354, + "learning_rate": 8.57142857142857e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.2109375, + "logps/chosen": -938.5, + "logps/rejected": -721.0, + "loss": 0.5788, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.09130859375, + "rewards/margins": 3.0390625, + "rewards/rejected": -1.9453125, + "step": 433 + }, + { + "epoch": 0.08611538270747557, + "grad_norm": 52.693686911347285, + "learning_rate": 8.59126984126984e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.05859375, + "logps/chosen": -1142.0, + "logps/rejected": -829.0, + "loss": 0.5543, + 
"rewards/accuracies": 0.84375, + "rewards/chosen": 0.747802734375, + "rewards/margins": 3.74609375, + "rewards/rejected": -3.0, + "step": 434 + }, + { + "epoch": 0.0863138052482762, + "grad_norm": 54.12737938512882, + "learning_rate": 8.611111111111111e-07, + "logits/chosen": 4.4765625, + "logits/rejected": 4.578125, + "logps/chosen": -924.0, + "logps/rejected": -784.0, + "loss": 0.6292, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8896484375, + "rewards/margins": 3.0390625, + "rewards/rejected": -2.1494140625, + "step": 435 + }, + { + "epoch": 0.08651222778907684, + "grad_norm": 54.33255716131601, + "learning_rate": 8.630952380952381e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.390625, + "logps/chosen": -979.0, + "logps/rejected": -1299.5, + "loss": 0.5267, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.07421875, + "rewards/margins": 4.203125, + "rewards/rejected": -3.1328125, + "step": 436 + }, + { + "epoch": 0.08671065032987747, + "grad_norm": 48.04702900315347, + "learning_rate": 8.650793650793651e-07, + "logits/chosen": 4.3984375, + "logits/rejected": 4.59375, + "logps/chosen": -842.0, + "logps/rejected": -731.0, + "loss": 0.6694, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.83154296875, + "rewards/margins": 3.03515625, + "rewards/rejected": -2.20703125, + "step": 437 + }, + { + "epoch": 0.0869090728706781, + "grad_norm": 57.67623957292311, + "learning_rate": 8.67063492063492e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.1796875, + "logps/chosen": -1178.0, + "logps/rejected": -1002.0, + "loss": 0.4848, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.96875, + "rewards/margins": 4.4140625, + "rewards/rejected": -3.4453125, + "step": 438 + }, + { + "epoch": 0.08710749541147875, + "grad_norm": 73.26946037348888, + "learning_rate": 8.69047619047619e-07, + "logits/chosen": 3.75, + "logits/rejected": 3.95703125, + "logps/chosen": -980.5, + "logps/rejected": -955.0, + "loss": 0.6221, + "rewards/accuracies": 
0.875, + "rewards/chosen": 0.796875, + "rewards/margins": 2.6171875, + "rewards/rejected": -1.8212890625, + "step": 439 + }, + { + "epoch": 0.08730591795227938, + "grad_norm": 46.11714077226818, + "learning_rate": 8.71031746031746e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 4.4140625, + "logps/chosen": -678.0, + "logps/rejected": -1724.5, + "loss": 0.5606, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.927734375, + "rewards/margins": 4.37890625, + "rewards/rejected": -3.4453125, + "step": 440 + }, + { + "epoch": 0.08750434049308001, + "grad_norm": 42.543847234044236, + "learning_rate": 8.73015873015873e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 3.8203125, + "logps/chosen": -1094.0, + "logps/rejected": -991.0, + "loss": 0.6336, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.2803955078125, + "rewards/margins": 3.2265625, + "rewards/rejected": -1.94921875, + "step": 441 + }, + { + "epoch": 0.08770276303388065, + "grad_norm": 50.48799489521222, + "learning_rate": 8.75e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.59375, + "logps/chosen": -1189.0, + "logps/rejected": -792.0, + "loss": 0.626, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.1103515625, + "rewards/margins": 3.02734375, + "rewards/rejected": -1.91796875, + "step": 442 + }, + { + "epoch": 0.08790118557468128, + "grad_norm": 46.64995099218814, + "learning_rate": 8.76984126984127e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.72265625, + "logps/chosen": -996.0, + "logps/rejected": -689.0, + "loss": 0.6253, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.013427734375, + "rewards/margins": 3.1484375, + "rewards/rejected": -2.140625, + "step": 443 + }, + { + "epoch": 0.08809960811548193, + "grad_norm": 57.096717449079115, + "learning_rate": 8.78968253968254e-07, + "logits/chosen": 3.46875, + "logits/rejected": 3.5625, + "logps/chosen": -975.0, + "logps/rejected": -832.0, + "loss": 0.485, + "rewards/accuracies": 0.8125, + "rewards/chosen": 
1.599609375, + "rewards/margins": 3.765625, + "rewards/rejected": -2.1640625, + "step": 444 + }, + { + "epoch": 0.08829803065628256, + "grad_norm": 51.528884023838415, + "learning_rate": 8.809523809523809e-07, + "logits/chosen": 3.5390625, + "logits/rejected": 3.73046875, + "logps/chosen": -1040.0, + "logps/rejected": -644.0, + "loss": 0.5713, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.859375, + "rewards/margins": 2.92578125, + "rewards/rejected": -2.0703125, + "step": 445 + }, + { + "epoch": 0.08849645319708319, + "grad_norm": 38.472590894291656, + "learning_rate": 8.829365079365078e-07, + "logits/chosen": 4.37890625, + "logits/rejected": 4.5859375, + "logps/chosen": -843.5, + "logps/rejected": -660.5, + "loss": 0.4817, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.3955078125, + "rewards/margins": 3.96484375, + "rewards/rejected": -2.564453125, + "step": 446 + }, + { + "epoch": 0.08869487573788383, + "grad_norm": 52.039850124522054, + "learning_rate": 8.849206349206348e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.26171875, + "logps/chosen": -1047.0, + "logps/rejected": -775.5, + "loss": 0.613, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.3046875, + "rewards/margins": 3.140625, + "rewards/rejected": -1.8359375, + "step": 447 + }, + { + "epoch": 0.08889329827868446, + "grad_norm": 55.46497987440143, + "learning_rate": 8.869047619047618e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.03125, + "logps/chosen": -1696.0, + "logps/rejected": -596.5, + "loss": 0.6872, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.265625, + "rewards/margins": 1.52734375, + "rewards/rejected": -1.26171875, + "step": 448 + }, + { + "epoch": 0.08909172081948509, + "grad_norm": 41.76700500513939, + "learning_rate": 8.888888888888888e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.1171875, + "logps/chosen": -933.0, + "logps/rejected": -561.0, + "loss": 0.4648, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.314453125, + 
"rewards/margins": 3.37109375, + "rewards/rejected": -2.05859375, + "step": 449 + }, + { + "epoch": 0.08929014336028573, + "grad_norm": 49.43227851262878, + "learning_rate": 8.908730158730159e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 4.16015625, + "logps/chosen": -946.0, + "logps/rejected": -658.0, + "loss": 0.6582, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.7041015625, + "rewards/margins": 3.751953125, + "rewards/rejected": -3.048828125, + "step": 450 + }, + { + "epoch": 0.08948856590108636, + "grad_norm": 82.934496785908, + "learning_rate": 8.928571428571428e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 4.0546875, + "logps/chosen": -972.0, + "logps/rejected": -956.0, + "loss": 0.5947, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.904296875, + "rewards/margins": 6.03515625, + "rewards/rejected": -4.130859375, + "step": 451 + }, + { + "epoch": 0.089686988441887, + "grad_norm": 40.46098412915802, + "learning_rate": 8.948412698412698e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 4.359375, + "logps/chosen": -738.0, + "logps/rejected": -524.0, + "loss": 0.6126, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.78564453125, + "rewards/margins": 3.0927734375, + "rewards/rejected": -2.3115234375, + "step": 452 + }, + { + "epoch": 0.08988541098268764, + "grad_norm": 43.87417737704721, + "learning_rate": 8.968253968253968e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.46875, + "logps/chosen": -1015.0, + "logps/rejected": -724.0, + "loss": 0.5347, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.4873046875, + "rewards/margins": 3.546875, + "rewards/rejected": -2.0546875, + "step": 453 + }, + { + "epoch": 0.09008383352348827, + "grad_norm": 52.79300056636686, + "learning_rate": 8.988095238095238e-07, + "logits/chosen": 4.671875, + "logits/rejected": 4.6796875, + "logps/chosen": -1311.0, + "logps/rejected": -896.0, + "loss": 0.5462, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.361328125, + 
"rewards/margins": 4.04296875, + "rewards/rejected": -2.681640625, + "step": 454 + }, + { + "epoch": 0.0902822560642889, + "grad_norm": 49.024499605812544, + "learning_rate": 9.007936507936508e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.04296875, + "logps/chosen": -806.0, + "logps/rejected": -559.0, + "loss": 0.5759, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7685546875, + "rewards/margins": 3.173828125, + "rewards/rejected": -2.4072265625, + "step": 455 + }, + { + "epoch": 0.09048067860508954, + "grad_norm": 48.35833618892343, + "learning_rate": 9.027777777777778e-07, + "logits/chosen": 3.890625, + "logits/rejected": 4.171875, + "logps/chosen": -1068.0, + "logps/rejected": -665.0, + "loss": 0.6262, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.2509765625, + "rewards/margins": 3.08203125, + "rewards/rejected": -1.83203125, + "step": 456 + }, + { + "epoch": 0.09067910114589017, + "grad_norm": 47.42879155098235, + "learning_rate": 9.047619047619047e-07, + "logits/chosen": 3.77734375, + "logits/rejected": 4.15625, + "logps/chosen": -1143.5, + "logps/rejected": -981.5, + "loss": 0.501, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.98876953125, + "rewards/margins": 4.142578125, + "rewards/rejected": -3.14453125, + "step": 457 + }, + { + "epoch": 0.0908775236866908, + "grad_norm": 45.64393229447945, + "learning_rate": 9.067460317460317e-07, + "logits/chosen": 3.953125, + "logits/rejected": 4.26953125, + "logps/chosen": -986.0, + "logps/rejected": -914.0, + "loss": 0.5461, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.1162109375, + "rewards/margins": 3.625, + "rewards/rejected": -2.505859375, + "step": 458 + }, + { + "epoch": 0.09107594622749145, + "grad_norm": 56.99339713587863, + "learning_rate": 9.087301587301587e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.1953125, + "logps/chosen": -1235.0, + "logps/rejected": -643.0, + "loss": 0.4844, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.0078125, + 
"rewards/margins": 3.9765625, + "rewards/rejected": -2.96875, + "step": 459 + }, + { + "epoch": 0.09127436876829208, + "grad_norm": 47.551816709688794, + "learning_rate": 9.107142857142857e-07, + "logits/chosen": 3.921875, + "logits/rejected": 3.98828125, + "logps/chosen": -884.0, + "logps/rejected": -630.0, + "loss": 0.5646, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.82177734375, + "rewards/margins": 3.3359375, + "rewards/rejected": -2.509765625, + "step": 460 + }, + { + "epoch": 0.09147279130909271, + "grad_norm": 52.56518570796189, + "learning_rate": 9.126984126984127e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.3125, + "logps/chosen": -1115.0, + "logps/rejected": -1340.5, + "loss": 0.5939, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.2236328125, + "rewards/margins": 4.046875, + "rewards/rejected": -2.81640625, + "step": 461 + }, + { + "epoch": 0.09167121384989335, + "grad_norm": 44.062346124479646, + "learning_rate": 9.146825396825397e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.359375, + "logps/chosen": -1071.0, + "logps/rejected": -477.5, + "loss": 0.5986, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.8515625, + "rewards/margins": 3.34765625, + "rewards/rejected": -2.498046875, + "step": 462 + }, + { + "epoch": 0.09186963639069398, + "grad_norm": 54.444160127638135, + "learning_rate": 9.166666666666665e-07, + "logits/chosen": 3.921875, + "logits/rejected": 3.75, + "logps/chosen": -1138.0, + "logps/rejected": -838.5, + "loss": 0.5584, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.39453125, + "rewards/margins": 3.4921875, + "rewards/rejected": -2.0966796875, + "step": 463 + }, + { + "epoch": 0.09206805893149461, + "grad_norm": 40.2109209961371, + "learning_rate": 9.186507936507935e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.07421875, + "logps/chosen": -1124.5, + "logps/rejected": -1867.5, + "loss": 0.6451, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.80078125, + 
"rewards/margins": 4.263671875, + "rewards/rejected": -3.4609375, + "step": 464 + }, + { + "epoch": 0.09226648147229526, + "grad_norm": 44.64450290759425, + "learning_rate": 9.206349206349205e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 4.03515625, + "logps/chosen": -958.0, + "logps/rejected": -719.0, + "loss": 0.5765, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.712158203125, + "rewards/margins": 3.09375, + "rewards/rejected": -2.3828125, + "step": 465 + }, + { + "epoch": 0.09246490401309589, + "grad_norm": 51.527923537500484, + "learning_rate": 9.226190476190476e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 4.0390625, + "logps/chosen": -1078.0, + "logps/rejected": -719.0, + "loss": 0.4544, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0654296875, + "rewards/margins": 3.9609375, + "rewards/rejected": -2.890625, + "step": 466 + }, + { + "epoch": 0.09266332655389652, + "grad_norm": 47.27225580672264, + "learning_rate": 9.246031746031746e-07, + "logits/chosen": 4.203125, + "logits/rejected": 3.98046875, + "logps/chosen": -1067.0, + "logps/rejected": -924.0, + "loss": 0.5471, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9287109375, + "rewards/margins": 3.76171875, + "rewards/rejected": -2.828125, + "step": 467 + }, + { + "epoch": 0.09286174909469716, + "grad_norm": 50.21937956361417, + "learning_rate": 9.265873015873016e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.2734375, + "logps/chosen": -1085.0, + "logps/rejected": -769.0, + "loss": 0.487, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.84912109375, + "rewards/margins": 3.66015625, + "rewards/rejected": -2.8046875, + "step": 468 + }, + { + "epoch": 0.09306017163549779, + "grad_norm": 39.97478622947582, + "learning_rate": 9.285714285714285e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.95703125, + "logps/chosen": -1082.0, + "logps/rejected": -856.0, + "loss": 0.5275, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.50390625, + 
"rewards/margins": 4.35546875, + "rewards/rejected": -2.8515625, + "step": 469 + }, + { + "epoch": 0.09325859417629843, + "grad_norm": 40.02473238072967, + "learning_rate": 9.305555555555555e-07, + "logits/chosen": 4.23046875, + "logits/rejected": 4.265625, + "logps/chosen": -922.0, + "logps/rejected": -1557.0, + "loss": 0.5228, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.091796875, + "rewards/margins": 4.94921875, + "rewards/rejected": -3.8515625, + "step": 470 + }, + { + "epoch": 0.09345701671709906, + "grad_norm": 67.6531476169713, + "learning_rate": 9.325396825396825e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.41796875, + "logps/chosen": -847.5, + "logps/rejected": -673.0, + "loss": 0.5737, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.544189453125, + "rewards/margins": 3.47265625, + "rewards/rejected": -2.9296875, + "step": 471 + }, + { + "epoch": 0.0936554392578997, + "grad_norm": 33.653828646877244, + "learning_rate": 9.345238095238095e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.23828125, + "logps/chosen": -682.0, + "logps/rejected": -946.0, + "loss": 0.6052, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.1796875, + "rewards/margins": 3.40625, + "rewards/rejected": -2.224609375, + "step": 472 + }, + { + "epoch": 0.09385386179870034, + "grad_norm": 49.656731964373535, + "learning_rate": 9.365079365079365e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.9609375, + "logps/chosen": -941.0, + "logps/rejected": -1621.0, + "loss": 0.5427, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.71240234375, + "rewards/margins": 5.78125, + "rewards/rejected": -5.08203125, + "step": 473 + }, + { + "epoch": 0.09405228433950097, + "grad_norm": 45.27986909585309, + "learning_rate": 9.384920634920635e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.9140625, + "logps/chosen": -843.0, + "logps/rejected": -671.0, + "loss": 0.5302, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.06689453125, + 
"rewards/margins": 3.41796875, + "rewards/rejected": -2.35546875, + "step": 474 + }, + { + "epoch": 0.0942507068803016, + "grad_norm": 58.409979835316136, + "learning_rate": 9.404761904761904e-07, + "logits/chosen": 4.546875, + "logits/rejected": 4.32421875, + "logps/chosen": -1042.0, + "logps/rejected": -765.0, + "loss": 0.6366, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.9443359375, + "rewards/margins": 2.70703125, + "rewards/rejected": -1.7578125, + "step": 475 + }, + { + "epoch": 0.09444912942110224, + "grad_norm": 47.84847404133748, + "learning_rate": 9.424603174603174e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.1171875, + "logps/chosen": -866.0, + "logps/rejected": -527.5, + "loss": 0.6424, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.088134765625, + "rewards/margins": 2.2568359375, + "rewards/rejected": -1.1708984375, + "step": 476 + }, + { + "epoch": 0.09464755196190287, + "grad_norm": 42.40246405820648, + "learning_rate": 9.444444444444444e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.04296875, + "logps/chosen": -1069.0, + "logps/rejected": -693.5, + "loss": 0.556, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.23291015625, + "rewards/margins": 3.46484375, + "rewards/rejected": -2.234375, + "step": 477 + }, + { + "epoch": 0.0948459745027035, + "grad_norm": 46.16736173589445, + "learning_rate": 9.464285714285714e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.38671875, + "logps/chosen": -743.0, + "logps/rejected": -599.0, + "loss": 0.6113, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.19921875, + "rewards/margins": 2.55859375, + "rewards/rejected": -1.36328125, + "step": 478 + }, + { + "epoch": 0.09504439704350415, + "grad_norm": 49.30454544809727, + "learning_rate": 9.484126984126984e-07, + "logits/chosen": 3.953125, + "logits/rejected": 4.16015625, + "logps/chosen": -1217.0, + "logps/rejected": -713.0, + "loss": 0.5146, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.416015625, 
+ "rewards/margins": 3.65234375, + "rewards/rejected": -2.2421875, + "step": 479 + }, + { + "epoch": 0.09524281958430478, + "grad_norm": 59.61380378584687, + "learning_rate": 9.503968253968253e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.28515625, + "logps/chosen": -792.0, + "logps/rejected": -773.5, + "loss": 0.6006, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.951171875, + "rewards/margins": 2.810546875, + "rewards/rejected": -1.857421875, + "step": 480 + }, + { + "epoch": 0.09544124212510541, + "grad_norm": 49.249767584950135, + "learning_rate": 9.523809523809522e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.16015625, + "logps/chosen": -1952.0, + "logps/rejected": -697.5, + "loss": 0.4595, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.541015625, + "rewards/margins": 3.89453125, + "rewards/rejected": -3.35546875, + "step": 481 + }, + { + "epoch": 0.09563966466590605, + "grad_norm": 50.47735172446226, + "learning_rate": 9.543650793650794e-07, + "logits/chosen": 4.578125, + "logits/rejected": 4.40625, + "logps/chosen": -927.0, + "logps/rejected": -571.0, + "loss": 0.5741, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.01953125, + "rewards/margins": 3.23828125, + "rewards/rejected": -2.224609375, + "step": 482 + }, + { + "epoch": 0.09583808720670668, + "grad_norm": 42.399560131366485, + "learning_rate": 9.563492063492063e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.3359375, + "logps/chosen": -1019.0, + "logps/rejected": -673.0, + "loss": 0.5451, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.853759765625, + "rewards/margins": 3.421875, + "rewards/rejected": -2.5703125, + "step": 483 + }, + { + "epoch": 0.09603650974750731, + "grad_norm": 45.13395159324945, + "learning_rate": 9.583333333333334e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 4.6640625, + "logps/chosen": -833.0, + "logps/rejected": -1498.0, + "loss": 0.6212, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.585784912109375, 
+ "rewards/margins": 4.5703125, + "rewards/rejected": -2.984375, + "step": 484 + }, + { + "epoch": 0.09623493228830796, + "grad_norm": 52.38027919342259, + "learning_rate": 9.603174603174603e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.1484375, + "logps/chosen": -1191.0, + "logps/rejected": -696.0, + "loss": 0.4781, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2275390625, + "rewards/margins": 4.4375, + "rewards/rejected": -3.21875, + "step": 485 + }, + { + "epoch": 0.09643335482910859, + "grad_norm": 37.92284420705475, + "learning_rate": 9.623015873015874e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.484375, + "logps/chosen": -921.0, + "logps/rejected": -659.0, + "loss": 0.5577, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.407470703125, + "rewards/margins": 3.048828125, + "rewards/rejected": -1.640625, + "step": 486 + }, + { + "epoch": 0.09663177736990922, + "grad_norm": 36.9012975318354, + "learning_rate": 9.642857142857142e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.86328125, + "logps/chosen": -909.0, + "logps/rejected": -718.0, + "loss": 0.6149, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.50732421875, + "rewards/margins": 3.0234375, + "rewards/rejected": -1.5185546875, + "step": 487 + }, + { + "epoch": 0.09683019991070986, + "grad_norm": 48.286714640029054, + "learning_rate": 9.662698412698413e-07, + "logits/chosen": 4.44921875, + "logits/rejected": 4.203125, + "logps/chosen": -1492.0, + "logps/rejected": -663.5, + "loss": 0.4675, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.3076171875, + "rewards/margins": 3.87890625, + "rewards/rejected": -2.572265625, + "step": 488 + }, + { + "epoch": 0.09702862245151049, + "grad_norm": 54.2980048864875, + "learning_rate": 9.682539682539682e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.4375, + "logps/chosen": -1286.0, + "logps/rejected": -878.5, + "loss": 0.5785, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.095703125, + 
"rewards/margins": 3.890625, + "rewards/rejected": -2.80224609375, + "step": 489 + }, + { + "epoch": 0.09722704499231112, + "grad_norm": 43.727098714484704, + "learning_rate": 9.70238095238095e-07, + "logits/chosen": 3.90625, + "logits/rejected": 4.1640625, + "logps/chosen": -974.5, + "logps/rejected": -742.0, + "loss": 0.5507, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.02734375, + "rewards/margins": 6.640625, + "rewards/rejected": -5.615234375, + "step": 490 + }, + { + "epoch": 0.09742546753311176, + "grad_norm": 51.526441731421265, + "learning_rate": 9.722222222222222e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 4.16015625, + "logps/chosen": -1288.0, + "logps/rejected": -983.0, + "loss": 0.5477, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.54736328125, + "rewards/margins": 4.8046875, + "rewards/rejected": -4.26171875, + "step": 491 + }, + { + "epoch": 0.0976238900739124, + "grad_norm": 36.045259126714974, + "learning_rate": 9.74206349206349e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.234375, + "logps/chosen": -1073.0, + "logps/rejected": -734.0, + "loss": 0.4517, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.423828125, + "rewards/margins": 10.12109375, + "rewards/rejected": -8.6875, + "step": 492 + }, + { + "epoch": 0.09782231261471303, + "grad_norm": 43.953089746212584, + "learning_rate": 9.761904761904762e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 4.2890625, + "logps/chosen": -893.0, + "logps/rejected": -655.0, + "loss": 0.5334, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.2294921875, + "rewards/margins": 3.537109375, + "rewards/rejected": -2.2998046875, + "step": 493 + }, + { + "epoch": 0.09802073515551367, + "grad_norm": 39.58262419501216, + "learning_rate": 9.78174603174603e-07, + "logits/chosen": 3.66796875, + "logits/rejected": 3.83203125, + "logps/chosen": -717.0, + "logps/rejected": -654.5, + "loss": 0.5163, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.708984375, + 
"rewards/margins": 3.359375, + "rewards/rejected": -2.650390625, + "step": 494 + }, + { + "epoch": 0.0982191576963143, + "grad_norm": 42.936687817075665, + "learning_rate": 9.801587301587301e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.33203125, + "logps/chosen": -965.0, + "logps/rejected": -690.0, + "loss": 0.5546, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.332275390625, + "rewards/margins": 3.6875, + "rewards/rejected": -2.35546875, + "step": 495 + }, + { + "epoch": 0.09841758023711493, + "grad_norm": 46.81891701838191, + "learning_rate": 9.82142857142857e-07, + "logits/chosen": 3.5625, + "logits/rejected": 3.98828125, + "logps/chosen": -823.5, + "logps/rejected": -704.5, + "loss": 0.6012, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.06640625, + "rewards/margins": 3.203125, + "rewards/rejected": -2.138671875, + "step": 496 + }, + { + "epoch": 0.09861600277791557, + "grad_norm": 53.03338191696795, + "learning_rate": 9.84126984126984e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.453125, + "logps/chosen": -1124.0, + "logps/rejected": -2598.0, + "loss": 0.4851, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.216796875, + "rewards/margins": 7.0390625, + "rewards/rejected": -5.8359375, + "step": 497 + }, + { + "epoch": 0.0988144253187162, + "grad_norm": 38.94210829196083, + "learning_rate": 9.861111111111112e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.8671875, + "logps/chosen": -1119.0, + "logps/rejected": -678.5, + "loss": 0.4488, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.525390625, + "rewards/margins": 3.9453125, + "rewards/rejected": -2.42578125, + "step": 498 + }, + { + "epoch": 0.09901284785951685, + "grad_norm": 42.12155785131869, + "learning_rate": 9.88095238095238e-07, + "logits/chosen": 3.7734375, + "logits/rejected": 4.0234375, + "logps/chosen": -874.0, + "logps/rejected": -679.0, + "loss": 0.5457, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1572265625, + "rewards/margins": 
3.21875, + "rewards/rejected": -2.060546875, + "step": 499 + }, + { + "epoch": 0.09921127040031748, + "grad_norm": 112.16555423319704, + "learning_rate": 9.900793650793651e-07, + "logits/chosen": 3.953125, + "logits/rejected": 3.93359375, + "logps/chosen": -1137.0, + "logps/rejected": -967.0, + "loss": 0.5952, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.28076171875, + "rewards/margins": 4.6328125, + "rewards/rejected": -2.345703125, + "step": 500 + }, + { + "epoch": 0.09940969294111811, + "grad_norm": 42.81189657747877, + "learning_rate": 9.92063492063492e-07, + "logits/chosen": 3.65625, + "logits/rejected": 3.6796875, + "logps/chosen": -872.0, + "logps/rejected": -556.0, + "loss": 0.5649, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.9993896484375, + "rewards/margins": 3.109375, + "rewards/rejected": -2.107421875, + "step": 501 + }, + { + "epoch": 0.09960811548191875, + "grad_norm": 53.55928258562378, + "learning_rate": 9.940476190476191e-07, + "logits/chosen": 3.68359375, + "logits/rejected": 3.81640625, + "logps/chosen": -1089.0, + "logps/rejected": -698.0, + "loss": 0.5461, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.0283203125, + "rewards/margins": 3.23046875, + "rewards/rejected": -2.201171875, + "step": 502 + }, + { + "epoch": 0.09980653802271938, + "grad_norm": 46.630402822291245, + "learning_rate": 9.96031746031746e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.05078125, + "logps/chosen": -893.0, + "logps/rejected": -606.5, + "loss": 0.6615, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.0029296875, + "rewards/margins": 2.30859375, + "rewards/rejected": -1.30419921875, + "step": 503 + }, + { + "epoch": 0.10000496056352001, + "grad_norm": 62.34729239416617, + "learning_rate": 9.98015873015873e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 3.9609375, + "logps/chosen": -1106.0, + "logps/rejected": -641.5, + "loss": 0.5666, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.064453125, + 
"rewards/margins": 3.27734375, + "rewards/rejected": -2.212890625, + "step": 504 + }, + { + "epoch": 0.10020338310432066, + "grad_norm": 43.36562394978288, + "learning_rate": 1e-06, + "logits/chosen": 4.37109375, + "logits/rejected": 4.578125, + "logps/chosen": -704.0, + "logps/rejected": -580.5, + "loss": 0.6357, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.44744873046875, + "rewards/margins": 2.4609375, + "rewards/rejected": -2.01171875, + "step": 505 + }, + { + "epoch": 0.10040180564512129, + "grad_norm": 44.0308616667386, + "learning_rate": 9.999998920714966e-07, + "logits/chosen": 4.28515625, + "logits/rejected": 4.625, + "logps/chosen": -1231.0, + "logps/rejected": -695.0, + "loss": 0.4562, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.732421875, + "rewards/margins": 4.3671875, + "rewards/rejected": -2.6328125, + "step": 506 + }, + { + "epoch": 0.10060022818592192, + "grad_norm": 47.080491181307174, + "learning_rate": 9.999995682860387e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 4.0390625, + "logps/chosen": -1124.0, + "logps/rejected": -836.0, + "loss": 0.5357, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.26953125, + "rewards/margins": 4.0, + "rewards/rejected": -2.7265625, + "step": 507 + }, + { + "epoch": 0.10079865072672256, + "grad_norm": 42.77555340710978, + "learning_rate": 9.99999028643781e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.15625, + "logps/chosen": -1002.0, + "logps/rejected": -687.5, + "loss": 0.5153, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.33544921875, + "rewards/margins": 3.734375, + "rewards/rejected": -2.400390625, + "step": 508 + }, + { + "epoch": 0.10099707326752319, + "grad_norm": 45.2873238033212, + "learning_rate": 9.999982731449832e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.6015625, + "logps/chosen": -981.0, + "logps/rejected": -619.0, + "loss": 0.5218, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.01318359375, + "rewards/margins": 3.875, + 
"rewards/rejected": -2.859375, + "step": 509 + }, + { + "epoch": 0.10119549580832382, + "grad_norm": 48.941586116963826, + "learning_rate": 9.999973017900069e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 4.1171875, + "logps/chosen": -707.5, + "logps/rejected": -739.5, + "loss": 0.5791, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.013916015625, + "rewards/margins": 3.7265625, + "rewards/rejected": -2.7109375, + "step": 510 + }, + { + "epoch": 0.10139391834912446, + "grad_norm": 41.8361058459073, + "learning_rate": 9.999961145793185e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.15234375, + "logps/chosen": -836.5, + "logps/rejected": -808.0, + "loss": 0.4849, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2255859375, + "rewards/margins": 4.39453125, + "rewards/rejected": -3.17578125, + "step": 511 + }, + { + "epoch": 0.1015923408899251, + "grad_norm": 39.50731275849444, + "learning_rate": 9.999947115134872e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.6015625, + "logps/chosen": -910.0, + "logps/rejected": -1536.0, + "loss": 0.5732, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.6201171875, + "rewards/margins": 4.86328125, + "rewards/rejected": -3.244140625, + "step": 512 + }, + { + "epoch": 0.10179076343072573, + "grad_norm": 55.62503406203584, + "learning_rate": 9.999930925931862e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.234375, + "logps/chosen": -880.0, + "logps/rejected": -679.0, + "loss": 0.6164, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.63604736328125, + "rewards/margins": 2.8994140625, + "rewards/rejected": -2.265625, + "step": 513 + }, + { + "epoch": 0.10198918597152637, + "grad_norm": 40.632904791999835, + "learning_rate": 9.999912578191921e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 4.30859375, + "logps/chosen": -1042.0, + "logps/rejected": -1120.0, + "loss": 0.4683, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.654296875, + "rewards/margins": 
4.42578125, + "rewards/rejected": -2.7646484375, + "step": 514 + }, + { + "epoch": 0.102187608512327, + "grad_norm": 48.41046155778987, + "learning_rate": 9.99989207192385e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.2109375, + "logps/chosen": -1156.0, + "logps/rejected": -840.0, + "loss": 0.4994, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.24072265625, + "rewards/margins": 3.9453125, + "rewards/rejected": -2.7021484375, + "step": 515 + }, + { + "epoch": 0.10238603105312763, + "grad_norm": 44.12664497940824, + "learning_rate": 9.999869407137483e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.09765625, + "logps/chosen": -898.5, + "logps/rejected": -1145.0, + "loss": 0.575, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.98583984375, + "rewards/margins": 4.44921875, + "rewards/rejected": -3.458984375, + "step": 516 + }, + { + "epoch": 0.10258445359392827, + "grad_norm": 40.83261263364419, + "learning_rate": 9.999844583843695e-07, + "logits/chosen": 3.51953125, + "logits/rejected": 3.734375, + "logps/chosen": -965.0, + "logps/rejected": -758.5, + "loss": 0.4763, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.529296875, + "rewards/margins": 3.98046875, + "rewards/rejected": -2.453125, + "step": 517 + }, + { + "epoch": 0.1027828761347289, + "grad_norm": 49.704744682741236, + "learning_rate": 9.999817602054392e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.875, + "logps/chosen": -868.0, + "logps/rejected": -1359.5, + "loss": 0.6442, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.759765625, + "rewards/margins": 3.9453125, + "rewards/rejected": -3.189453125, + "step": 518 + }, + { + "epoch": 0.10298129867552953, + "grad_norm": 40.9716707785936, + "learning_rate": 9.999788461782516e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 4.3125, + "logps/chosen": -772.0, + "logps/rejected": -829.0, + "loss": 0.5824, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.119140625, + "rewards/margins": 3.27734375, 
+ "rewards/rejected": -2.162109375, + "step": 519 + }, + { + "epoch": 0.10317972121633018, + "grad_norm": 58.78349126653634, + "learning_rate": 9.999757163042046e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.15234375, + "logps/chosen": -1065.0, + "logps/rejected": -707.5, + "loss": 0.5536, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.10986328125, + "rewards/margins": 3.939453125, + "rewards/rejected": -2.828125, + "step": 520 + }, + { + "epoch": 0.10337814375713081, + "grad_norm": 49.787036087138496, + "learning_rate": 9.999723705847997e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 4.15234375, + "logps/chosen": -1187.0, + "logps/rejected": -649.0, + "loss": 0.5041, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.466796875, + "rewards/margins": 3.76171875, + "rewards/rejected": -2.29296875, + "step": 521 + }, + { + "epoch": 0.10357656629793144, + "grad_norm": 60.55380456400088, + "learning_rate": 9.999688090216417e-07, + "logits/chosen": 3.53125, + "logits/rejected": 3.69921875, + "logps/chosen": -937.0, + "logps/rejected": -630.0, + "loss": 0.7204, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.1064453125, + "rewards/margins": 2.3193359375, + "rewards/rejected": -2.2177734375, + "step": 522 + }, + { + "epoch": 0.10377498883873208, + "grad_norm": 42.71541867682342, + "learning_rate": 9.999650316164387e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.98046875, + "logps/chosen": -956.0, + "logps/rejected": -724.0, + "loss": 0.551, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.748046875, + "rewards/margins": 3.8359375, + "rewards/rejected": -3.09375, + "step": 523 + }, + { + "epoch": 0.10397341137953271, + "grad_norm": 56.3919090432713, + "learning_rate": 9.999610383710028e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 4.0078125, + "logps/chosen": -951.0, + "logps/rejected": -504.5, + "loss": 0.6337, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28997802734375, + "rewards/margins": 
2.416015625, + "rewards/rejected": -2.12890625, + "step": 524 + }, + { + "epoch": 0.10417183392033336, + "grad_norm": 44.31335194939229, + "learning_rate": 9.999568292872498e-07, + "logits/chosen": 4.35546875, + "logits/rejected": 4.4921875, + "logps/chosen": -948.0, + "logps/rejected": -660.5, + "loss": 0.5423, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.982421875, + "rewards/margins": 3.5546875, + "rewards/rejected": -2.56640625, + "step": 525 + }, + { + "epoch": 0.10437025646113399, + "grad_norm": 50.791154990053094, + "learning_rate": 9.999524043671984e-07, + "logits/chosen": 3.984375, + "logits/rejected": 4.32421875, + "logps/chosen": -1069.0, + "logps/rejected": -681.5, + "loss": 0.511, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.9365234375, + "rewards/margins": 3.83984375, + "rewards/rejected": -2.91015625, + "step": 526 + }, + { + "epoch": 0.10456867900193462, + "grad_norm": 41.56685069650134, + "learning_rate": 9.999477636129714e-07, + "logits/chosen": 3.5859375, + "logits/rejected": 4.125, + "logps/chosen": -1045.0, + "logps/rejected": -616.0, + "loss": 0.5304, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.67431640625, + "rewards/margins": 3.47265625, + "rewards/rejected": -2.798828125, + "step": 527 + }, + { + "epoch": 0.10476710154273526, + "grad_norm": 38.4315775758853, + "learning_rate": 9.999429070267945e-07, + "logits/chosen": 3.7734375, + "logits/rejected": 3.94921875, + "logps/chosen": -853.0, + "logps/rejected": -685.0, + "loss": 0.4176, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.26953125, + "rewards/margins": 6.953125, + "rewards/rejected": -5.69140625, + "step": 528 + }, + { + "epoch": 0.10496552408353589, + "grad_norm": 39.47044320131969, + "learning_rate": 9.999378346109974e-07, + "logits/chosen": 3.75390625, + "logits/rejected": 4.03515625, + "logps/chosen": -710.5, + "logps/rejected": -451.5, + "loss": 0.6279, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.43310546875, + "rewards/margins": 
3.1591796875, + "rewards/rejected": -1.7230224609375, + "step": 529 + }, + { + "epoch": 0.10516394662433652, + "grad_norm": 51.82816484547445, + "learning_rate": 9.999325463680136e-07, + "logits/chosen": 3.765625, + "logits/rejected": 3.82421875, + "logps/chosen": -965.0, + "logps/rejected": -732.5, + "loss": 0.5716, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.3095703125, + "rewards/margins": 3.34765625, + "rewards/rejected": -2.0400390625, + "step": 530 + }, + { + "epoch": 0.10536236916513717, + "grad_norm": 43.45061471811763, + "learning_rate": 9.9992704230038e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 4.1484375, + "logps/chosen": -811.5, + "logps/rejected": -617.0, + "loss": 0.5106, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.9814453125, + "rewards/margins": 3.63671875, + "rewards/rejected": -2.6484375, + "step": 531 + }, + { + "epoch": 0.1055607917059378, + "grad_norm": 42.93848807064919, + "learning_rate": 9.999213224107358e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 4.1015625, + "logps/chosen": -1183.0, + "logps/rejected": -843.0, + "loss": 0.4396, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.822265625, + "rewards/margins": 6.140625, + "rewards/rejected": -4.3046875, + "step": 532 + }, + { + "epoch": 0.10575921424673843, + "grad_norm": 42.87420355625388, + "learning_rate": 9.999153867018255e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 4.48046875, + "logps/chosen": -838.0, + "logps/rejected": -1269.0, + "loss": 0.5154, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.248046875, + "rewards/margins": 5.09765625, + "rewards/rejected": -3.84375, + "step": 533 + }, + { + "epoch": 0.10595763678753907, + "grad_norm": 44.6405373265354, + "learning_rate": 9.999092351764964e-07, + "logits/chosen": 4.546875, + "logits/rejected": 4.6171875, + "logps/chosen": -1036.0, + "logps/rejected": -1501.5, + "loss": 0.5083, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8251953125, + "rewards/margins": 
5.80078125, + "rewards/rejected": -3.978515625, + "step": 534 + }, + { + "epoch": 0.1061560593283397, + "grad_norm": 49.0297872957324, + "learning_rate": 9.99902867837699e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.23046875, + "logps/chosen": -934.0, + "logps/rejected": -739.0, + "loss": 0.5409, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.06640625, + "rewards/margins": 3.8984375, + "rewards/rejected": -2.826171875, + "step": 535 + }, + { + "epoch": 0.10635448186914033, + "grad_norm": 48.94537752028232, + "learning_rate": 9.998962846884872e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 4.30859375, + "logps/chosen": -784.0, + "logps/rejected": -649.5, + "loss": 0.6455, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.52685546875, + "rewards/margins": 2.8359375, + "rewards/rejected": -2.30859375, + "step": 536 + }, + { + "epoch": 0.10655290440994097, + "grad_norm": 41.90376780106071, + "learning_rate": 9.998894857320197e-07, + "logits/chosen": 3.890625, + "logits/rejected": 4.203125, + "logps/chosen": -1033.0, + "logps/rejected": -602.0, + "loss": 0.5843, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.2197265625, + "rewards/margins": 3.345703125, + "rewards/rejected": -2.1240234375, + "step": 537 + }, + { + "epoch": 0.1067513269507416, + "grad_norm": 37.702560714701306, + "learning_rate": 9.99882470971557e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 4.01171875, + "logps/chosen": -563.5, + "logps/rejected": -855.0, + "loss": 0.6752, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.75146484375, + "rewards/margins": 3.150390625, + "rewards/rejected": -2.39453125, + "step": 538 + }, + { + "epoch": 0.10694974949154223, + "grad_norm": 40.23981703007417, + "learning_rate": 9.998752404104649e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.078125, + "logps/chosen": -992.0, + "logps/rejected": -655.5, + "loss": 0.5534, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.431640625, + "rewards/margins": 
3.4296875, + "rewards/rejected": -2.001953125, + "step": 539 + }, + { + "epoch": 0.10714817203234288, + "grad_norm": 38.694713122221515, + "learning_rate": 9.99867794052211e-07, + "logits/chosen": 3.67578125, + "logits/rejected": 3.96875, + "logps/chosen": -1086.0, + "logps/rejected": -1230.0, + "loss": 0.4459, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.283203125, + "rewards/margins": 5.11328125, + "rewards/rejected": -3.830078125, + "step": 540 + }, + { + "epoch": 0.10734659457314351, + "grad_norm": 53.42684178256604, + "learning_rate": 9.998601319003673e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 4.0625, + "logps/chosen": -1044.0, + "logps/rejected": -976.0, + "loss": 0.5521, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.3291015625, + "rewards/margins": 4.67578125, + "rewards/rejected": -3.34375, + "step": 541 + }, + { + "epoch": 0.10754501711394414, + "grad_norm": 46.72344785724607, + "learning_rate": 9.998522539586093e-07, + "logits/chosen": 3.69921875, + "logits/rejected": 3.7578125, + "logps/chosen": -824.0, + "logps/rejected": -616.5, + "loss": 0.5867, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.93310546875, + "rewards/margins": 2.98828125, + "rewards/rejected": -2.0517578125, + "step": 542 + }, + { + "epoch": 0.10774343965474478, + "grad_norm": 49.31578098484427, + "learning_rate": 9.998441602307161e-07, + "logits/chosen": 3.84375, + "logits/rejected": 3.8203125, + "logps/chosen": -970.0, + "logps/rejected": -704.0, + "loss": 0.6151, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.87890625, + "rewards/margins": 2.896484375, + "rewards/rejected": -2.01611328125, + "step": 543 + }, + { + "epoch": 0.10794186219554541, + "grad_norm": 46.265657768871385, + "learning_rate": 9.998358507205698e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 4.01953125, + "logps/chosen": -688.5, + "logps/rejected": -477.0, + "loss": 0.5549, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.037109375, + "rewards/margins": 
3.384765625, + "rewards/rejected": -2.345703125, + "step": 544 + }, + { + "epoch": 0.10814028473634604, + "grad_norm": 47.64346452609365, + "learning_rate": 9.998273254321567e-07, + "logits/chosen": 4.046875, + "logits/rejected": 4.0078125, + "logps/chosen": -852.0, + "logps/rejected": -750.5, + "loss": 0.5915, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.048828125, + "rewards/margins": 3.234375, + "rewards/rejected": -2.189453125, + "step": 545 + }, + { + "epoch": 0.10833870727714669, + "grad_norm": 43.83187554481503, + "learning_rate": 9.998185843695656e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.109375, + "logps/chosen": -1195.0, + "logps/rejected": -689.5, + "loss": 0.4771, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.123046875, + "rewards/margins": 3.77734375, + "rewards/rejected": -2.66015625, + "step": 546 + }, + { + "epoch": 0.10853712981794732, + "grad_norm": 43.73956817521345, + "learning_rate": 9.998096275369903e-07, + "logits/chosen": 3.5703125, + "logits/rejected": 4.0, + "logps/chosen": -689.5, + "logps/rejected": -529.5, + "loss": 0.6162, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.099609375, + "rewards/margins": 2.6875, + "rewards/rejected": -1.587890625, + "step": 547 + }, + { + "epoch": 0.10873555235874795, + "grad_norm": 68.83719635763136, + "learning_rate": 9.998004549387265e-07, + "logits/chosen": 3.58984375, + "logits/rejected": 3.8984375, + "logps/chosen": -1093.0, + "logps/rejected": -833.0, + "loss": 0.425, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.5810546875, + "rewards/margins": 4.734375, + "rewards/rejected": -3.14453125, + "step": 548 + }, + { + "epoch": 0.10893397489954859, + "grad_norm": 41.87829841651573, + "learning_rate": 9.997910665791744e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.515625, + "logps/chosen": -916.0, + "logps/rejected": -1254.0, + "loss": 0.4831, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.865234375, + "rewards/margins": 6.0078125, + 
"rewards/rejected": -4.138671875, + "step": 549 + }, + { + "epoch": 0.10913239744034922, + "grad_norm": 41.86829076454738, + "learning_rate": 9.997814624628373e-07, + "logits/chosen": 4.09375, + "logits/rejected": 3.8984375, + "logps/chosen": -1082.5, + "logps/rejected": -710.5, + "loss": 0.4898, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.458984375, + "rewards/margins": 3.62890625, + "rewards/rejected": -2.169921875, + "step": 550 + }, + { + "epoch": 0.10933081998114987, + "grad_norm": 37.296092316193416, + "learning_rate": 9.997716425943223e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 3.69140625, + "logps/chosen": -786.0, + "logps/rejected": -416.0, + "loss": 0.5392, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.2236328125, + "rewards/margins": 3.03515625, + "rewards/rejected": -1.814453125, + "step": 551 + }, + { + "epoch": 0.1095292425219505, + "grad_norm": 46.662786511207855, + "learning_rate": 9.9976160697834e-07, + "logits/chosen": 3.49609375, + "logits/rejected": 3.6484375, + "logps/chosen": -1021.0, + "logps/rejected": -722.0, + "loss": 0.5594, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.4462890625, + "rewards/margins": 3.64453125, + "rewards/rejected": -2.19091796875, + "step": 552 + }, + { + "epoch": 0.10972766506275113, + "grad_norm": 46.60541350338176, + "learning_rate": 9.997513556197041e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.046875, + "logps/chosen": -1076.0, + "logps/rejected": -612.0, + "loss": 0.4982, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.27783203125, + "rewards/margins": 3.8359375, + "rewards/rejected": -2.560546875, + "step": 553 + }, + { + "epoch": 0.10992608760355177, + "grad_norm": 42.490603460342065, + "learning_rate": 9.997408885233317e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.75, + "logps/chosen": -936.5, + "logps/rejected": -802.5, + "loss": 0.6265, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.9979248046875, + "rewards/margins": 
4.43359375, + "rewards/rejected": -3.4375, + "step": 554 + }, + { + "epoch": 0.1101245101443524, + "grad_norm": 40.944748538434276, + "learning_rate": 9.99730205694244e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.19140625, + "logps/chosen": -796.0, + "logps/rejected": -570.5, + "loss": 0.6838, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.7607421875, + "rewards/margins": 2.80859375, + "rewards/rejected": -2.046142578125, + "step": 555 + }, + { + "epoch": 0.11032293268515303, + "grad_norm": 48.56749812478717, + "learning_rate": 9.997193071375653e-07, + "logits/chosen": 3.953125, + "logits/rejected": 3.75, + "logps/chosen": -980.0, + "logps/rejected": -564.0, + "loss": 0.5916, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.56884765625, + "rewards/margins": 3.4765625, + "rewards/rejected": -2.9140625, + "step": 556 + }, + { + "epoch": 0.11052135522595367, + "grad_norm": 40.39276171555387, + "learning_rate": 9.997081928585236e-07, + "logits/chosen": 4.015625, + "logits/rejected": 4.19921875, + "logps/chosen": -926.0, + "logps/rejected": -876.0, + "loss": 0.5131, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.548828125, + "rewards/margins": 4.00390625, + "rewards/rejected": -2.455078125, + "step": 557 + }, + { + "epoch": 0.1107197777667543, + "grad_norm": 44.48206605512236, + "learning_rate": 9.9969686286245e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.17578125, + "logps/chosen": -925.0, + "logps/rejected": -542.0, + "loss": 0.5432, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0338134765625, + "rewards/margins": 3.81640625, + "rewards/rejected": -2.779296875, + "step": 558 + }, + { + "epoch": 0.11091820030755493, + "grad_norm": 39.049154299020984, + "learning_rate": 9.996853171547793e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.0625, + "logps/chosen": -1126.0, + "logps/rejected": -748.5, + "loss": 0.5985, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.1328125, + "rewards/margins": 3.19921875, + 
"rewards/rejected": -2.068359375, + "step": 559 + }, + { + "epoch": 0.11111662284835558, + "grad_norm": 45.46279132437588, + "learning_rate": 9.996735557410499e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.40625, + "logps/chosen": -762.5, + "logps/rejected": -588.5, + "loss": 0.6365, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7418212890625, + "rewards/margins": 2.801513671875, + "rewards/rejected": -2.06494140625, + "step": 560 + }, + { + "epoch": 0.11131504538915621, + "grad_norm": 43.316560200638435, + "learning_rate": 9.996615786269034e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.9609375, + "logps/chosen": -1157.0, + "logps/rejected": -1366.0, + "loss": 0.4003, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.140625, + "rewards/margins": 5.6484375, + "rewards/rejected": -4.5078125, + "step": 561 + }, + { + "epoch": 0.11151346792995684, + "grad_norm": 48.56907940617945, + "learning_rate": 9.996493858180854e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 4.40625, + "logps/chosen": -791.0, + "logps/rejected": -993.0, + "loss": 0.6207, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.8349609375, + "rewards/margins": 3.3046875, + "rewards/rejected": -2.466796875, + "step": 562 + }, + { + "epoch": 0.11171189047075748, + "grad_norm": 45.01345673809866, + "learning_rate": 9.99636977320444e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 4.1171875, + "logps/chosen": -1442.0, + "logps/rejected": -825.0, + "loss": 0.3803, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.931640625, + "rewards/margins": 4.98828125, + "rewards/rejected": -3.060546875, + "step": 563 + }, + { + "epoch": 0.11191031301155811, + "grad_norm": 44.28338301367185, + "learning_rate": 9.996243531399316e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.17578125, + "logps/chosen": -1114.0, + "logps/rejected": -696.5, + "loss": 0.5331, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.2158203125, + "rewards/margins": 
3.2421875, + "rewards/rejected": -2.0322265625, + "step": 564 + }, + { + "epoch": 0.11210873555235874, + "grad_norm": 45.701055382796994, + "learning_rate": 9.99611513282604e-07, + "logits/chosen": 4.31640625, + "logits/rejected": 4.41796875, + "logps/chosen": -1090.0, + "logps/rejected": -664.5, + "loss": 0.5622, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.71337890625, + "rewards/margins": 3.90625, + "rewards/rejected": -3.19140625, + "step": 565 + }, + { + "epoch": 0.11230715809315939, + "grad_norm": 38.874509505582296, + "learning_rate": 9.995984577546197e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.33984375, + "logps/chosen": -922.0, + "logps/rejected": -669.0, + "loss": 0.4442, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4140625, + "rewards/margins": 4.0625, + "rewards/rejected": -2.65234375, + "step": 566 + }, + { + "epoch": 0.11250558063396002, + "grad_norm": 44.374730217247375, + "learning_rate": 9.995851865622418e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 3.7734375, + "logps/chosen": -782.5, + "logps/rejected": -515.0, + "loss": 0.51, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2177734375, + "rewards/margins": 3.3515625, + "rewards/rejected": -2.134765625, + "step": 567 + }, + { + "epoch": 0.11270400317476065, + "grad_norm": 42.69174268518988, + "learning_rate": 9.99571699711836e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 4.19921875, + "logps/chosen": -759.0, + "logps/rejected": -1090.0, + "loss": 0.542, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.00634765625, + "rewards/margins": 4.5859375, + "rewards/rejected": -3.5859375, + "step": 568 + }, + { + "epoch": 0.11290242571556129, + "grad_norm": 40.806997012921876, + "learning_rate": 9.995579972098716e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.265625, + "logps/chosen": -1097.5, + "logps/rejected": -726.0, + "loss": 0.4962, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.477294921875, + "rewards/margins": 
3.734375, + "rewards/rejected": -2.25390625, + "step": 569 + }, + { + "epoch": 0.11310084825636192, + "grad_norm": 46.833597601424444, + "learning_rate": 9.995440790629217e-07, + "logits/chosen": 4.3515625, + "logits/rejected": 4.51953125, + "logps/chosen": -991.0, + "logps/rejected": -802.0, + "loss": 0.6066, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.251953125, + "rewards/margins": 9.890625, + "rewards/rejected": -8.619140625, + "step": 570 + }, + { + "epoch": 0.11329927079716255, + "grad_norm": 40.582046078872175, + "learning_rate": 9.995299452776625e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.140625, + "logps/chosen": -1332.0, + "logps/rejected": -1390.0, + "loss": 0.4394, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.900390625, + "rewards/margins": 5.890625, + "rewards/rejected": -3.990234375, + "step": 571 + }, + { + "epoch": 0.1134976933379632, + "grad_norm": 41.66737086240201, + "learning_rate": 9.995155958608735e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 4.05859375, + "logps/chosen": -1185.0, + "logps/rejected": -762.5, + "loss": 0.4292, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.7841796875, + "rewards/margins": 4.515625, + "rewards/rejected": -2.7373046875, + "step": 572 + }, + { + "epoch": 0.11369611587876383, + "grad_norm": 61.925619377528335, + "learning_rate": 9.99501030819438e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 4.00390625, + "logps/chosen": -1080.0, + "logps/rejected": -708.0, + "loss": 0.5443, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2060546875, + "rewards/margins": 4.515625, + "rewards/rejected": -3.298828125, + "step": 573 + }, + { + "epoch": 0.11389453841956446, + "grad_norm": 45.548385316537825, + "learning_rate": 9.99486250160343e-07, + "logits/chosen": 4.46875, + "logits/rejected": 4.5546875, + "logps/chosen": -816.5, + "logps/rejected": -638.5, + "loss": 0.4316, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.728515625, + "rewards/margins": 
4.64453125, + "rewards/rejected": -2.92578125, + "step": 574 + }, + { + "epoch": 0.1140929609603651, + "grad_norm": 45.44413531550036, + "learning_rate": 9.994712538906775e-07, + "logits/chosen": 4.5859375, + "logits/rejected": 4.8828125, + "logps/chosen": -777.0, + "logps/rejected": -577.0, + "loss": 0.4763, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1376953125, + "rewards/margins": 3.703125, + "rewards/rejected": -2.5625, + "step": 575 + }, + { + "epoch": 0.11429138350116573, + "grad_norm": 39.87176298645287, + "learning_rate": 9.994560420176362e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.26171875, + "logps/chosen": -908.0, + "logps/rejected": -507.0, + "loss": 0.5176, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.9820556640625, + "rewards/margins": 3.828125, + "rewards/rejected": -2.84765625, + "step": 576 + }, + { + "epoch": 0.11448980604196636, + "grad_norm": 40.61441025571986, + "learning_rate": 9.994406145485149e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.25390625, + "logps/chosen": -1090.0, + "logps/rejected": -648.0, + "loss": 0.3902, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.2685546875, + "rewards/margins": 5.0625, + "rewards/rejected": -3.796875, + "step": 577 + }, + { + "epoch": 0.114688228582767, + "grad_norm": 39.59563881912077, + "learning_rate": 9.994249714907147e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.70703125, + "logps/chosen": -939.0, + "logps/rejected": -626.0, + "loss": 0.5396, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.91796875, + "rewards/margins": 3.5234375, + "rewards/rejected": -2.6015625, + "step": 578 + }, + { + "epoch": 0.11488665112356763, + "grad_norm": 43.00589889310554, + "learning_rate": 9.994091128517388e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.3671875, + "logps/chosen": -1049.5, + "logps/rejected": -917.0, + "loss": 0.5119, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.5791015625, + "rewards/margins": 4.626953125, + 
"rewards/rejected": -3.03515625, + "step": 579 + }, + { + "epoch": 0.11508507366436828, + "grad_norm": 42.50858378862667, + "learning_rate": 9.993930386391944e-07, + "logits/chosen": 3.67578125, + "logits/rejected": 4.4765625, + "logps/chosen": -1129.0, + "logps/rejected": -1465.0, + "loss": 0.5282, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.5029296875, + "rewards/margins": 5.89453125, + "rewards/rejected": -4.38671875, + "step": 580 + }, + { + "epoch": 0.11528349620516891, + "grad_norm": 45.129994401444286, + "learning_rate": 9.99376748860792e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.453125, + "logps/chosen": -1240.0, + "logps/rejected": -962.0, + "loss": 0.4131, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.5947265625, + "rewards/margins": 5.46875, + "rewards/rejected": -3.87109375, + "step": 581 + }, + { + "epoch": 0.11548191874596954, + "grad_norm": 42.8811593022709, + "learning_rate": 9.993602435243458e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.44921875, + "logps/chosen": -1211.0, + "logps/rejected": -668.0, + "loss": 0.5286, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1240234375, + "rewards/margins": 3.71484375, + "rewards/rejected": -2.58984375, + "step": 582 + }, + { + "epoch": 0.11568034128677018, + "grad_norm": 43.3751212771323, + "learning_rate": 9.993435226377726e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.05078125, + "logps/chosen": -1185.0, + "logps/rejected": -848.0, + "loss": 0.5517, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.3316650390625, + "rewards/margins": 4.44921875, + "rewards/rejected": -3.1171875, + "step": 583 + }, + { + "epoch": 0.11587876382757081, + "grad_norm": 42.05015830377148, + "learning_rate": 9.993265862090937e-07, + "logits/chosen": 3.5546875, + "logits/rejected": 3.75390625, + "logps/chosen": -848.0, + "logps/rejected": -572.0, + "loss": 0.6093, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.03515625, + "rewards/margins": 3.84765625, + 
"rewards/rejected": -2.806640625, + "step": 584 + }, + { + "epoch": 0.11607718636837144, + "grad_norm": 41.43253694261935, + "learning_rate": 9.993094342464328e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.41015625, + "logps/chosen": -926.0, + "logps/rejected": -671.0, + "loss": 0.4812, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.240234375, + "rewards/margins": 4.13671875, + "rewards/rejected": -2.890625, + "step": 585 + }, + { + "epoch": 0.11627560890917209, + "grad_norm": 52.25118278728839, + "learning_rate": 9.992920667580175e-07, + "logits/chosen": 3.78515625, + "logits/rejected": 4.06640625, + "logps/chosen": -893.0, + "logps/rejected": -592.5, + "loss": 0.6527, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.521728515625, + "rewards/margins": 2.8505859375, + "rewards/rejected": -2.32421875, + "step": 586 + }, + { + "epoch": 0.11647403144997272, + "grad_norm": 44.328824154097404, + "learning_rate": 9.992744837521786e-07, + "logits/chosen": 3.5625, + "logits/rejected": 3.7421875, + "logps/chosen": -951.0, + "logps/rejected": -973.5, + "loss": 0.5362, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.97607421875, + "rewards/margins": 4.3125, + "rewards/rejected": -3.3359375, + "step": 587 + }, + { + "epoch": 0.11667245399077335, + "grad_norm": 42.52695613695543, + "learning_rate": 9.992566852373505e-07, + "logits/chosen": 4.0, + "logits/rejected": 3.8359375, + "logps/chosen": -1200.0, + "logps/rejected": -686.5, + "loss": 0.464, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.55078125, + "rewards/margins": 3.69140625, + "rewards/rejected": -2.14453125, + "step": 588 + }, + { + "epoch": 0.11687087653157399, + "grad_norm": 41.376969588502924, + "learning_rate": 9.992386712220707e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.25390625, + "logps/chosen": -885.0, + "logps/rejected": -1539.0, + "loss": 0.523, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.79541015625, + "rewards/margins": 5.03515625, + 
"rewards/rejected": -4.23828125, + "step": 589 + }, + { + "epoch": 0.11706929907237462, + "grad_norm": 51.55248345919678, + "learning_rate": 9.992204417149801e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.23828125, + "logps/chosen": -1104.0, + "logps/rejected": -734.5, + "loss": 0.5261, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.958984375, + "rewards/margins": 4.0859375, + "rewards/rejected": -3.1328125, + "step": 590 + }, + { + "epoch": 0.11726772161317525, + "grad_norm": 43.04763437372041, + "learning_rate": 9.992019967248236e-07, + "logits/chosen": 3.87109375, + "logits/rejected": 3.96484375, + "logps/chosen": -1163.0, + "logps/rejected": -770.0, + "loss": 0.4777, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.052734375, + "rewards/margins": 4.01953125, + "rewards/rejected": -2.96484375, + "step": 591 + }, + { + "epoch": 0.1174661441539759, + "grad_norm": 36.189072437719766, + "learning_rate": 9.991833362604485e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 4.1484375, + "logps/chosen": -1023.0, + "logps/rejected": -726.0, + "loss": 0.3497, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.94921875, + "rewards/margins": 5.328125, + "rewards/rejected": -3.37890625, + "step": 592 + }, + { + "epoch": 0.11766456669477653, + "grad_norm": 38.25236120523357, + "learning_rate": 9.991644603308057e-07, + "logits/chosen": 4.51171875, + "logits/rejected": 4.6171875, + "logps/chosen": -1140.5, + "logps/rejected": -719.5, + "loss": 0.5309, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.51513671875, + "rewards/margins": 4.013671875, + "rewards/rejected": -2.4990234375, + "step": 593 + }, + { + "epoch": 0.11786298923557716, + "grad_norm": 46.316008718476795, + "learning_rate": 9.9914536894495e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 4.03125, + "logps/chosen": -1132.0, + "logps/rejected": -679.0, + "loss": 0.4648, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.27734375, + "rewards/margins": 3.98046875, + 
"rewards/rejected": -2.70703125, + "step": 594 + }, + { + "epoch": 0.1180614117763778, + "grad_norm": 51.96405007439854, + "learning_rate": 9.991260621120394e-07, + "logits/chosen": 4.42578125, + "logits/rejected": 4.5546875, + "logps/chosen": -906.0, + "logps/rejected": -2212.0, + "loss": 0.4755, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.37939453125, + "rewards/margins": 6.23828125, + "rewards/rejected": -4.8671875, + "step": 595 + }, + { + "epoch": 0.11825983431717843, + "grad_norm": 47.71112377812809, + "learning_rate": 9.991065398413343e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.1640625, + "logps/chosen": -794.0, + "logps/rejected": -693.0, + "loss": 0.5967, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.90234375, + "rewards/margins": 3.0625, + "rewards/rejected": -2.16015625, + "step": 596 + }, + { + "epoch": 0.11845825685797906, + "grad_norm": 38.138723565236354, + "learning_rate": 9.990868021422e-07, + "logits/chosen": 4.33984375, + "logits/rejected": 4.6015625, + "logps/chosen": -1083.0, + "logps/rejected": -1180.0, + "loss": 0.5072, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.50927734375, + "rewards/margins": 4.625, + "rewards/rejected": -3.1142578125, + "step": 597 + }, + { + "epoch": 0.1186566793987797, + "grad_norm": 40.64064553152967, + "learning_rate": 9.990668490241038e-07, + "logits/chosen": 4.6875, + "logits/rejected": 4.5625, + "logps/chosen": -1106.0, + "logps/rejected": -784.0, + "loss": 0.5148, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4208984375, + "rewards/margins": 3.77734375, + "rewards/rejected": -2.357421875, + "step": 598 + }, + { + "epoch": 0.11885510193958033, + "grad_norm": 41.148457955156786, + "learning_rate": 9.990466804966171e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.30078125, + "logps/chosen": -1178.0, + "logps/rejected": -781.0, + "loss": 0.4599, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.033203125, + "rewards/margins": 4.20703125, + 
"rewards/rejected": -2.17578125, + "step": 599 + }, + { + "epoch": 0.11905352448038097, + "grad_norm": 48.3070208842317, + "learning_rate": 9.990262965694143e-07, + "logits/chosen": 4.50390625, + "logits/rejected": 4.375, + "logps/chosen": -1057.0, + "logps/rejected": -942.0, + "loss": 0.3911, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.30078125, + "rewards/margins": 5.140625, + "rewards/rejected": -2.853515625, + "step": 600 + }, + { + "epoch": 0.11925194702118161, + "grad_norm": 37.075358806175345, + "learning_rate": 9.990056972522732e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.2265625, + "logps/chosen": -1065.0, + "logps/rejected": -604.5, + "loss": 0.4861, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.357421875, + "rewards/margins": 3.78125, + "rewards/rejected": -2.4296875, + "step": 601 + }, + { + "epoch": 0.11945036956198224, + "grad_norm": 42.925229028942695, + "learning_rate": 9.98984882555075e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.109375, + "logps/chosen": -1083.0, + "logps/rejected": -655.0, + "loss": 0.5157, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.140625, + "rewards/margins": 3.8359375, + "rewards/rejected": -2.6953125, + "step": 602 + }, + { + "epoch": 0.11964879210278287, + "grad_norm": 38.58223808346466, + "learning_rate": 9.98963852487804e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.38671875, + "logps/chosen": -1240.0, + "logps/rejected": -741.5, + "loss": 0.4029, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.86328125, + "rewards/margins": 4.8984375, + "rewards/rejected": -3.04296875, + "step": 603 + }, + { + "epoch": 0.11984721464358351, + "grad_norm": 45.76687234027917, + "learning_rate": 9.98942607060548e-07, + "logits/chosen": 4.23046875, + "logits/rejected": 4.1640625, + "logps/chosen": -1050.0, + "logps/rejected": -652.0, + "loss": 0.5507, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.921875, + "rewards/margins": 3.9921875, + "rewards/rejected": 
-3.07421875, + "step": 604 + }, + { + "epoch": 0.12004563718438414, + "grad_norm": 51.7440056127684, + "learning_rate": 9.98921146283498e-07, + "logits/chosen": 4.65625, + "logits/rejected": 4.3515625, + "logps/chosen": -838.0, + "logps/rejected": -654.0, + "loss": 0.5861, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0888671875, + "rewards/margins": 3.98046875, + "rewards/rejected": -2.8896484375, + "step": 605 + }, + { + "epoch": 0.12024405972518479, + "grad_norm": 41.31575493538002, + "learning_rate": 9.988994701669486e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.23828125, + "logps/chosen": -1230.0, + "logps/rejected": -736.0, + "loss": 0.4216, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.990234375, + "rewards/margins": 5.1171875, + "rewards/rejected": -3.11328125, + "step": 606 + }, + { + "epoch": 0.12044248226598542, + "grad_norm": 35.12153431848152, + "learning_rate": 9.988775787212973e-07, + "logits/chosen": 3.51171875, + "logits/rejected": 4.0625, + "logps/chosen": -1027.0, + "logps/rejected": -723.0, + "loss": 0.5755, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.1435546875, + "rewards/margins": 4.46875, + "rewards/rejected": -3.328125, + "step": 607 + }, + { + "epoch": 0.12064090480678605, + "grad_norm": 41.18216548705737, + "learning_rate": 9.98855471957045e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 4.171875, + "logps/chosen": -966.5, + "logps/rejected": -1430.0, + "loss": 0.5275, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4248046875, + "rewards/margins": 6.125, + "rewards/rejected": -4.7109375, + "step": 608 + }, + { + "epoch": 0.12083932734758669, + "grad_norm": 41.1951643665037, + "learning_rate": 9.988331498847957e-07, + "logits/chosen": 3.62890625, + "logits/rejected": 3.83203125, + "logps/chosen": -1130.0, + "logps/rejected": -759.0, + "loss": 0.583, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.21484375, + "rewards/margins": 3.9296875, + "rewards/rejected": -2.705078125, + 
"step": 609 + }, + { + "epoch": 0.12103774988838732, + "grad_norm": 39.416219380700596, + "learning_rate": 9.988106125152574e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 4.1484375, + "logps/chosen": -1552.5, + "logps/rejected": -573.25, + "loss": 0.4334, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29296875, + "rewards/margins": 3.234375, + "rewards/rejected": -3.53515625, + "step": 610 + }, + { + "epoch": 0.12123617242918795, + "grad_norm": 37.369793428372816, + "learning_rate": 9.987878598592406e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.4921875, + "logps/chosen": -683.0, + "logps/rejected": -587.0, + "loss": 0.6199, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.890625, + "rewards/margins": 10.205078125, + "rewards/rejected": -9.3046875, + "step": 611 + }, + { + "epoch": 0.1214345949699886, + "grad_norm": 39.71926155123333, + "learning_rate": 9.987648919276594e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 4.0859375, + "logps/chosen": -920.0, + "logps/rejected": -1060.0, + "loss": 0.5387, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.291015625, + "rewards/margins": 4.09375, + "rewards/rejected": -2.796875, + "step": 612 + }, + { + "epoch": 0.12163301751078923, + "grad_norm": 44.217842026791494, + "learning_rate": 9.98741708731531e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 4.1171875, + "logps/chosen": -1065.0, + "logps/rejected": -1381.5, + "loss": 0.4954, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.296142578125, + "rewards/margins": 4.4453125, + "rewards/rejected": -3.142578125, + "step": 613 + }, + { + "epoch": 0.12183144005158986, + "grad_norm": 38.693771266794016, + "learning_rate": 9.98718310281976e-07, + "logits/chosen": 3.66796875, + "logits/rejected": 4.01953125, + "logps/chosen": -823.0, + "logps/rejected": -1102.0, + "loss": 0.6166, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.115234375, + "rewards/margins": 4.28515625, + "rewards/rejected": -3.1669921875, + 
"step": 614 + }, + { + "epoch": 0.1220298625923905, + "grad_norm": 39.98934648074984, + "learning_rate": 9.986946965902184e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.3828125, + "logps/chosen": -939.0, + "logps/rejected": -1062.0, + "loss": 0.5408, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.458984375, + "rewards/margins": 4.8515625, + "rewards/rejected": -3.39453125, + "step": 615 + }, + { + "epoch": 0.12222828513319113, + "grad_norm": 37.411868919975454, + "learning_rate": 9.98670867667585e-07, + "logits/chosen": 3.96875, + "logits/rejected": 4.23828125, + "logps/chosen": -1004.0, + "logps/rejected": -1213.5, + "loss": 0.5523, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.17578125, + "rewards/margins": 4.30078125, + "rewards/rejected": -3.12890625, + "step": 616 + }, + { + "epoch": 0.12242670767399176, + "grad_norm": 47.14929517944319, + "learning_rate": 9.986468235255064e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.2890625, + "logps/chosen": -747.0, + "logps/rejected": -864.5, + "loss": 0.4317, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.45703125, + "rewards/margins": 5.3671875, + "rewards/rejected": -2.90625, + "step": 617 + }, + { + "epoch": 0.1226251302147924, + "grad_norm": 36.518523625586916, + "learning_rate": 9.986225641755158e-07, + "logits/chosen": 4.609375, + "logits/rejected": 4.42578125, + "logps/chosen": -852.0, + "logps/rejected": -640.0, + "loss": 0.6402, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.3251953125, + "rewards/margins": 2.4921875, + "rewards/rejected": -1.1611328125, + "step": 618 + }, + { + "epoch": 0.12282355275559304, + "grad_norm": 60.55838723258358, + "learning_rate": 9.985980896292504e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.13671875, + "logps/chosen": -1336.0, + "logps/rejected": -792.0, + "loss": 0.497, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.064453125, + "rewards/margins": 3.984375, + "rewards/rejected": -1.923828125, + "step": 619 + 
}, + { + "epoch": 0.12302197529639367, + "grad_norm": 38.58485143612145, + "learning_rate": 9.985733998984498e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.04296875, + "logps/chosen": -1312.0, + "logps/rejected": -845.5, + "loss": 0.4983, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6328125, + "rewards/margins": 4.43359375, + "rewards/rejected": -2.80078125, + "step": 620 + }, + { + "epoch": 0.12322039783719431, + "grad_norm": 60.354782098418625, + "learning_rate": 9.985484949949575e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.1328125, + "logps/chosen": -1316.0, + "logps/rejected": -789.0, + "loss": 0.5086, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.0390625, + "rewards/margins": 4.35546875, + "rewards/rejected": -2.3125, + "step": 621 + }, + { + "epoch": 0.12341882037799494, + "grad_norm": 55.23956033737703, + "learning_rate": 9.985233749307197e-07, + "logits/chosen": 4.46484375, + "logits/rejected": 4.54296875, + "logps/chosen": -1091.5, + "logps/rejected": -748.0, + "loss": 0.4289, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.90234375, + "rewards/margins": 4.953125, + "rewards/rejected": -3.05859375, + "step": 622 + }, + { + "epoch": 0.12361724291879557, + "grad_norm": 79.1533794806616, + "learning_rate": 9.984980397177863e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.3984375, + "logps/chosen": -1230.0, + "logps/rejected": -622.0, + "loss": 0.5323, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.3388671875, + "rewards/margins": 4.2734375, + "rewards/rejected": -2.93359375, + "step": 623 + }, + { + "epoch": 0.12381566545959621, + "grad_norm": 42.69495382997433, + "learning_rate": 9.9847248936831e-07, + "logits/chosen": 4.3515625, + "logits/rejected": 4.36328125, + "logps/chosen": -892.5, + "logps/rejected": -439.0, + "loss": 0.7311, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7548828125, + "rewards/margins": 2.1632080078125, + "rewards/rejected": -1.408447265625, + "step": 624 + }, 
+ { + "epoch": 0.12401408800039684, + "grad_norm": 45.86651656704757, + "learning_rate": 9.984467238945469e-07, + "logits/chosen": 3.66796875, + "logits/rejected": 3.93359375, + "logps/chosen": -1299.0, + "logps/rejected": -731.0, + "loss": 0.4676, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.90234375, + "rewards/margins": 4.78125, + "rewards/rejected": -2.884765625, + "step": 625 + }, + { + "epoch": 0.12421251054119747, + "grad_norm": 37.81117370026823, + "learning_rate": 9.984207433088564e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.6953125, + "logps/chosen": -1049.0, + "logps/rejected": -1875.0, + "loss": 0.4946, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.4140625, + "rewards/margins": 6.6171875, + "rewards/rejected": -5.19921875, + "step": 626 + }, + { + "epoch": 0.12441093308199812, + "grad_norm": 33.810202590403414, + "learning_rate": 9.983945476237005e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.30859375, + "logps/chosen": -974.0, + "logps/rejected": -447.5, + "loss": 0.5534, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.5068359375, + "rewards/margins": 3.6796875, + "rewards/rejected": -2.171875, + "step": 627 + }, + { + "epoch": 0.12460935562279875, + "grad_norm": 49.67277287797323, + "learning_rate": 9.983681368516451e-07, + "logits/chosen": 3.87109375, + "logits/rejected": 3.91015625, + "logps/chosen": -947.0, + "logps/rejected": -996.0, + "loss": 0.4829, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.17578125, + "rewards/margins": 5.4296875, + "rewards/rejected": -3.25390625, + "step": 628 + }, + { + "epoch": 0.12480777816359938, + "grad_norm": 44.802446279046585, + "learning_rate": 9.98341511005359e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 4.19140625, + "logps/chosen": -1023.0, + "logps/rejected": -791.25, + "loss": 0.5118, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.8720703125, + "rewards/margins": 5.109375, + "rewards/rejected": -4.234375, + "step": 629 + }, + { + 
"epoch": 0.12500620070440002, + "grad_norm": 43.529640710309366, + "learning_rate": 9.98314670097614e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 4.21875, + "logps/chosen": -1077.0, + "logps/rejected": -1278.0, + "loss": 0.5329, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.1357421875, + "rewards/margins": 5.484375, + "rewards/rejected": -4.34375, + "step": 630 + }, + { + "epoch": 0.12520462324520065, + "grad_norm": 42.10319563986696, + "learning_rate": 9.982876141412855e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 4.2734375, + "logps/chosen": -1100.0, + "logps/rejected": -775.0, + "loss": 0.5498, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.777587890625, + "rewards/margins": 5.046875, + "rewards/rejected": -4.265625, + "step": 631 + }, + { + "epoch": 0.12540304578600128, + "grad_norm": 52.70223117922741, + "learning_rate": 9.982603431493515e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.828125, + "logps/chosen": -1077.0, + "logps/rejected": -937.0, + "loss": 0.6318, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6015625, + "rewards/margins": 3.62109375, + "rewards/rejected": -3.01953125, + "step": 632 + }, + { + "epoch": 0.1256014683268019, + "grad_norm": 46.954581508668255, + "learning_rate": 9.982328571348933e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.00390625, + "logps/chosen": -987.0, + "logps/rejected": -689.5, + "loss": 0.5186, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.3271484375, + "rewards/margins": 4.8203125, + "rewards/rejected": -3.5, + "step": 633 + }, + { + "epoch": 0.12579989086760257, + "grad_norm": 36.099234805557614, + "learning_rate": 9.982051561110957e-07, + "logits/chosen": 4.625, + "logits/rejected": 4.41796875, + "logps/chosen": -1588.0, + "logps/rejected": -673.5, + "loss": 0.4085, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7890625, + "rewards/margins": 6.04296875, + "rewards/rejected": -3.2578125, + "step": 634 + }, + { + "epoch": 
0.1259983134084032, + "grad_norm": 49.72852825590199, + "learning_rate": 9.981772400912463e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.0546875, + "logps/chosen": -919.0, + "logps/rejected": -571.0, + "loss": 0.506, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.076171875, + "rewards/margins": 3.984375, + "rewards/rejected": -2.9140625, + "step": 635 + }, + { + "epoch": 0.12619673594920383, + "grad_norm": 43.158419242484456, + "learning_rate": 9.981491090887358e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.02734375, + "logps/chosen": -896.5, + "logps/rejected": -651.0, + "loss": 0.5461, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.259765625, + "rewards/margins": 3.48046875, + "rewards/rejected": -2.2265625, + "step": 636 + }, + { + "epoch": 0.12639515849000446, + "grad_norm": 37.99728840960124, + "learning_rate": 9.981207631170585e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.48046875, + "logps/chosen": -831.5, + "logps/rejected": -726.0, + "loss": 0.5219, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.306640625, + "rewards/margins": 4.296875, + "rewards/rejected": -2.99609375, + "step": 637 + }, + { + "epoch": 0.1265935810308051, + "grad_norm": 37.3858819066498, + "learning_rate": 9.98092202189811e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 4.28515625, + "logps/chosen": -972.0, + "logps/rejected": -874.0, + "loss": 0.6188, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.7421875, + "rewards/margins": 4.453125, + "rewards/rejected": -2.705078125, + "step": 638 + }, + { + "epoch": 0.12679200357160572, + "grad_norm": 41.06715422571255, + "learning_rate": 9.980634263206938e-07, + "logits/chosen": 4.40234375, + "logits/rejected": 4.44921875, + "logps/chosen": -985.0, + "logps/rejected": -654.0, + "loss": 0.5962, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.515625, + "rewards/margins": 3.69921875, + "rewards/rejected": -2.185546875, + "step": 639 + }, + { + "epoch": 0.12699042611240638, 
+ "grad_norm": 45.38346426034655, + "learning_rate": 9.9803443552351e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 4.125, + "logps/chosen": -1206.0, + "logps/rejected": -728.5, + "loss": 0.4264, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.818359375, + "rewards/margins": 4.74609375, + "rewards/rejected": -2.93359375, + "step": 640 + }, + { + "epoch": 0.127188848653207, + "grad_norm": 40.45788285608343, + "learning_rate": 9.980052298121664e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.20703125, + "logps/chosen": -1016.0, + "logps/rejected": -619.5, + "loss": 0.5599, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.154296875, + "rewards/margins": 3.2421875, + "rewards/rejected": -2.08203125, + "step": 641 + }, + { + "epoch": 0.12738727119400764, + "grad_norm": 43.26927637681219, + "learning_rate": 9.979758092006717e-07, + "logits/chosen": 4.4921875, + "logits/rejected": 4.453125, + "logps/chosen": -1052.0, + "logps/rejected": -590.0, + "loss": 0.5464, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.65234375, + "rewards/margins": 3.625, + "rewards/rejected": -1.97705078125, + "step": 642 + }, + { + "epoch": 0.12758569373480827, + "grad_norm": 47.913356977992784, + "learning_rate": 9.979461737031389e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 4.171875, + "logps/chosen": -965.0, + "logps/rejected": -783.0, + "loss": 0.4788, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.41796875, + "rewards/margins": 4.20703125, + "rewards/rejected": -2.779296875, + "step": 643 + }, + { + "epoch": 0.1277841162756089, + "grad_norm": 40.28794840371295, + "learning_rate": 9.979163233337835e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 4.0390625, + "logps/chosen": -1214.0, + "logps/rejected": -818.0, + "loss": 0.4796, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.400390625, + "rewards/margins": 4.98828125, + "rewards/rejected": -3.5859375, + "step": 644 + }, + { + "epoch": 0.12798253881640956, + "grad_norm": 
40.71843974697876, + "learning_rate": 9.978862581069245e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.265625, + "logps/chosen": -1106.0, + "logps/rejected": -752.0, + "loss": 0.4513, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.2861328125, + "rewards/margins": 4.34375, + "rewards/rejected": -3.05859375, + "step": 645 + }, + { + "epoch": 0.1281809613572102, + "grad_norm": 41.30910171576697, + "learning_rate": 9.978559780369833e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.01171875, + "logps/chosen": -850.5, + "logps/rejected": -861.0, + "loss": 0.5615, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2667236328125, + "rewards/margins": 4.1875, + "rewards/rejected": -2.923828125, + "step": 646 + }, + { + "epoch": 0.12837938389801082, + "grad_norm": 46.045089335674156, + "learning_rate": 9.978254831384848e-07, + "logits/chosen": 4.46875, + "logits/rejected": 4.5703125, + "logps/chosen": -1001.0, + "logps/rejected": -752.0, + "loss": 0.4373, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.8515625, + "rewards/margins": 8.265625, + "rewards/rejected": -6.44921875, + "step": 647 + }, + { + "epoch": 0.12857780643881145, + "grad_norm": 39.605281009410774, + "learning_rate": 9.97794773426057e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 3.9921875, + "logps/chosen": -961.0, + "logps/rejected": -804.75, + "loss": 0.4892, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.3251953125, + "rewards/margins": 4.109375, + "rewards/rejected": -2.78515625, + "step": 648 + }, + { + "epoch": 0.12877622897961208, + "grad_norm": 42.13795903214708, + "learning_rate": 9.977638489144307e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.50390625, + "logps/chosen": -958.0, + "logps/rejected": -800.0, + "loss": 0.4535, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4453125, + "rewards/margins": 4.359375, + "rewards/rejected": -2.91796875, + "step": 649 + }, + { + "epoch": 0.1289746515204127, + "grad_norm": 45.69598947254104, + 
"learning_rate": 9.977327096184397e-07, + "logits/chosen": 3.78515625, + "logits/rejected": 3.72265625, + "logps/chosen": -1085.0, + "logps/rejected": -730.0, + "loss": 0.4196, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.8515625, + "rewards/margins": 5.3671875, + "rewards/rejected": -3.529296875, + "step": 650 + }, + { + "epoch": 0.12917307406121337, + "grad_norm": 48.5286196036089, + "learning_rate": 9.97701355553021e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 3.6640625, + "logps/chosen": -1165.0, + "logps/rejected": -873.0, + "loss": 0.4217, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.44921875, + "rewards/margins": 11.9453125, + "rewards/rejected": -10.4453125, + "step": 651 + }, + { + "epoch": 0.129371496602014, + "grad_norm": 35.18372040167906, + "learning_rate": 9.976697867332149e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.6640625, + "logps/chosen": -1152.0, + "logps/rejected": -756.5, + "loss": 0.4363, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.953125, + "rewards/margins": 4.94140625, + "rewards/rejected": -2.984375, + "step": 652 + }, + { + "epoch": 0.12956991914281463, + "grad_norm": 33.97932936926832, + "learning_rate": 9.97638003174164e-07, + "logits/chosen": 4.3515625, + "logits/rejected": 4.3671875, + "logps/chosen": -741.0, + "logps/rejected": -537.5, + "loss": 0.4507, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.232421875, + "rewards/margins": 4.21484375, + "rewards/rejected": -2.978515625, + "step": 653 + }, + { + "epoch": 0.12976834168361526, + "grad_norm": 34.76993077006796, + "learning_rate": 9.976060048911146e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.5, + "logps/chosen": -868.0, + "logps/rejected": -630.0, + "loss": 0.4346, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6953125, + "rewards/margins": 4.28515625, + "rewards/rejected": -2.58349609375, + "step": 654 + }, + { + "epoch": 0.1299667642244159, + "grad_norm": 40.88399595926102, + "learning_rate": 
9.975737918994157e-07, + "logits/chosen": 3.671875, + "logits/rejected": 3.890625, + "logps/chosen": -965.0, + "logps/rejected": -634.0, + "loss": 0.4785, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.755859375, + "rewards/margins": 4.32421875, + "rewards/rejected": -2.5703125, + "step": 655 + }, + { + "epoch": 0.13016518676521652, + "grad_norm": 47.378691378613425, + "learning_rate": 9.97541364214519e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 3.796875, + "logps/chosen": -1080.0, + "logps/rejected": -580.0, + "loss": 0.4279, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.41015625, + "rewards/margins": 4.484375, + "rewards/rejected": -3.0703125, + "step": 656 + }, + { + "epoch": 0.13036360930601718, + "grad_norm": 35.29743674950195, + "learning_rate": 9.975087218519798e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 4.02734375, + "logps/chosen": -1022.0, + "logps/rejected": -560.0, + "loss": 0.3896, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.353515625, + "rewards/margins": 5.484375, + "rewards/rejected": -3.125, + "step": 657 + }, + { + "epoch": 0.1305620318468178, + "grad_norm": 37.5879702411973, + "learning_rate": 9.974758648274558e-07, + "logits/chosen": 4.05078125, + "logits/rejected": 4.1484375, + "logps/chosen": -917.5, + "logps/rejected": -618.0, + "loss": 0.5571, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.81011962890625, + "rewards/margins": 4.40234375, + "rewards/rejected": -2.591796875, + "step": 658 + }, + { + "epoch": 0.13076045438761844, + "grad_norm": 44.335211203497224, + "learning_rate": 9.974427931567084e-07, + "logits/chosen": 3.984375, + "logits/rejected": 4.3671875, + "logps/chosen": -751.0, + "logps/rejected": -598.5, + "loss": 0.5931, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.9892578125, + "rewards/margins": 4.25, + "rewards/rejected": -3.25390625, + "step": 659 + }, + { + "epoch": 0.13095887692841907, + "grad_norm": 39.05492635899299, + "learning_rate": 
9.974095068556008e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.8046875, + "logps/chosen": -964.0, + "logps/rejected": -731.0, + "loss": 0.4773, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.578125, + "rewards/margins": 5.03125, + "rewards/rejected": -3.44921875, + "step": 660 + }, + { + "epoch": 0.1311572994692197, + "grad_norm": 43.670838169781234, + "learning_rate": 9.973760059401003e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.4296875, + "logps/chosen": -1264.0, + "logps/rejected": -615.0, + "loss": 0.3391, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.92578125, + "rewards/margins": 5.4296875, + "rewards/rejected": -3.49609375, + "step": 661 + }, + { + "epoch": 0.13135572201002033, + "grad_norm": 40.41089740163679, + "learning_rate": 9.973422904262768e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 4.09375, + "logps/chosen": -1086.0, + "logps/rejected": -684.0, + "loss": 0.41, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.787109375, + "rewards/margins": 5.177734375, + "rewards/rejected": -3.392578125, + "step": 662 + }, + { + "epoch": 0.13155414455082098, + "grad_norm": 43.24290503498707, + "learning_rate": 9.973083603303028e-07, + "logits/chosen": 4.125, + "logits/rejected": 3.75, + "logps/chosen": -795.0, + "logps/rejected": -446.0, + "loss": 0.4528, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.33056640625, + "rewards/margins": 3.8125, + "rewards/rejected": -2.482421875, + "step": 663 + }, + { + "epoch": 0.13175256709162161, + "grad_norm": 41.52953166513119, + "learning_rate": 9.97274215668454e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.04296875, + "logps/chosen": -1264.0, + "logps/rejected": -704.0, + "loss": 0.507, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.3447265625, + "rewards/margins": 4.5859375, + "rewards/rejected": -3.2421875, + "step": 664 + }, + { + "epoch": 0.13195098963242224, + "grad_norm": 41.131115933645326, + "learning_rate": 
9.972398564571089e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.515625, + "logps/chosen": -980.0, + "logps/rejected": -657.5, + "loss": 0.5476, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.72265625, + "rewards/margins": 3.84375, + "rewards/rejected": -2.12109375, + "step": 665 + }, + { + "epoch": 0.13214941217322287, + "grad_norm": 39.70377206512711, + "learning_rate": 9.972052827127493e-07, + "logits/chosen": 3.6796875, + "logits/rejected": 3.85546875, + "logps/chosen": -1104.0, + "logps/rejected": -514.5, + "loss": 0.5447, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.073486328125, + "rewards/margins": 3.748046875, + "rewards/rejected": -2.67578125, + "step": 666 + }, + { + "epoch": 0.1323478347140235, + "grad_norm": 36.574944605681694, + "learning_rate": 9.971704944519593e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.96484375, + "logps/chosen": -976.0, + "logps/rejected": -802.0, + "loss": 0.4902, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.123046875, + "rewards/margins": 4.59765625, + "rewards/rejected": -2.478515625, + "step": 667 + }, + { + "epoch": 0.13254625725482413, + "grad_norm": 44.88366065445168, + "learning_rate": 9.971354916914263e-07, + "logits/chosen": 3.87109375, + "logits/rejected": 4.22265625, + "logps/chosen": -1085.0, + "logps/rejected": -888.0, + "loss": 0.612, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.20703125, + "rewards/margins": 3.4306640625, + "rewards/rejected": -2.216796875, + "step": 668 + }, + { + "epoch": 0.1327446797956248, + "grad_norm": 48.88640047694307, + "learning_rate": 9.971002744479403e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.26171875, + "logps/chosen": -1017.0, + "logps/rejected": -542.5, + "loss": 0.4734, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.29296875, + "rewards/margins": 3.84765625, + "rewards/rejected": -2.5625, + "step": 669 + }, + { + "epoch": 0.13294310233642542, + "grad_norm": 57.92913798090936, + "learning_rate": 
9.97064842738395e-07, + "logits/chosen": 3.90625, + "logits/rejected": 4.0234375, + "logps/chosen": -1041.0, + "logps/rejected": -1521.5, + "loss": 0.4682, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.279296875, + "rewards/margins": 5.23046875, + "rewards/rejected": -3.958984375, + "step": 670 + }, + { + "epoch": 0.13314152487722605, + "grad_norm": 44.37365643185864, + "learning_rate": 9.970291965797858e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 4.07421875, + "logps/chosen": -723.5, + "logps/rejected": -993.0, + "loss": 0.6498, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.84027099609375, + "rewards/margins": 3.654296875, + "rewards/rejected": -2.8125, + "step": 671 + }, + { + "epoch": 0.13333994741802668, + "grad_norm": 43.803354082535215, + "learning_rate": 9.969933359892115e-07, + "logits/chosen": 3.73046875, + "logits/rejected": 3.8671875, + "logps/chosen": -895.0, + "logps/rejected": -530.5, + "loss": 0.4639, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.3330078125, + "rewards/margins": 4.4453125, + "rewards/rejected": -3.109375, + "step": 672 + }, + { + "epoch": 0.1335383699588273, + "grad_norm": 47.70733498326059, + "learning_rate": 9.969572609838744e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.94921875, + "logps/chosen": -1174.0, + "logps/rejected": -834.0, + "loss": 0.4063, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.53515625, + "rewards/margins": 4.95703125, + "rewards/rejected": -3.41796875, + "step": 673 + }, + { + "epoch": 0.13373679249962797, + "grad_norm": 42.61345519202839, + "learning_rate": 9.969209715810785e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.10546875, + "logps/chosen": -1040.0, + "logps/rejected": -695.0, + "loss": 0.5641, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.8701171875, + "rewards/margins": 3.7158203125, + "rewards/rejected": -2.845703125, + "step": 674 + }, + { + "epoch": 0.1339352150404286, + "grad_norm": 52.18706741414723, + 
"learning_rate": 9.968844677982312e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.44921875, + "logps/chosen": -1034.0, + "logps/rejected": -789.0, + "loss": 0.4862, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4033203125, + "rewards/margins": 4.69921875, + "rewards/rejected": -3.2890625, + "step": 675 + }, + { + "epoch": 0.13413363758122923, + "grad_norm": 39.5425152445898, + "learning_rate": 9.968477496528426e-07, + "logits/chosen": 3.73828125, + "logits/rejected": 3.72265625, + "logps/chosen": -1310.0, + "logps/rejected": -928.0, + "loss": 0.459, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.67578125, + "rewards/margins": 5.1796875, + "rewards/rejected": -3.5, + "step": 676 + }, + { + "epoch": 0.13433206012202986, + "grad_norm": 40.35844095479555, + "learning_rate": 9.968108171625264e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 4.2421875, + "logps/chosen": -729.5, + "logps/rejected": -631.5, + "loss": 0.4555, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.2783203125, + "rewards/margins": 4.76953125, + "rewards/rejected": -3.4921875, + "step": 677 + }, + { + "epoch": 0.1345304826628305, + "grad_norm": 42.60425745488453, + "learning_rate": 9.96773670344998e-07, + "logits/chosen": 4.46875, + "logits/rejected": 4.62890625, + "logps/chosen": -972.0, + "logps/rejected": -810.0, + "loss": 0.502, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5390625, + "rewards/margins": 4.7578125, + "rewards/rejected": -3.21875, + "step": 678 + }, + { + "epoch": 0.13472890520363112, + "grad_norm": 37.888584580118895, + "learning_rate": 9.967363092180759e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.97265625, + "logps/chosen": -810.0, + "logps/rejected": -600.0, + "loss": 0.5189, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2841796875, + "rewards/margins": 4.6875, + "rewards/rejected": -3.40234375, + "step": 679 + }, + { + "epoch": 0.13492732774443178, + "grad_norm": 35.46020116345346, + "learning_rate": 
9.966987337996818e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.30859375, + "logps/chosen": -1289.0, + "logps/rejected": -736.0, + "loss": 0.3085, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.703125, + "rewards/margins": 6.0859375, + "rewards/rejected": -3.390625, + "step": 680 + }, + { + "epoch": 0.1351257502852324, + "grad_norm": 44.4352720867265, + "learning_rate": 9.9666094410784e-07, + "logits/chosen": 3.68359375, + "logits/rejected": 3.9453125, + "logps/chosen": -984.0, + "logps/rejected": -865.0, + "loss": 0.569, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.431640625, + "rewards/margins": 4.40234375, + "rewards/rejected": -2.9658203125, + "step": 681 + }, + { + "epoch": 0.13532417282603304, + "grad_norm": 37.83866683436481, + "learning_rate": 9.966229401606774e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.32421875, + "logps/chosen": -921.5, + "logps/rejected": -951.0, + "loss": 0.4469, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.791015625, + "rewards/margins": 5.609375, + "rewards/rejected": -3.8125, + "step": 682 + }, + { + "epoch": 0.13552259536683367, + "grad_norm": 47.73610368603832, + "learning_rate": 9.965847219764239e-07, + "logits/chosen": 3.3671875, + "logits/rejected": 3.609375, + "logps/chosen": -752.0, + "logps/rejected": -550.5, + "loss": 0.6644, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.57373046875, + "rewards/margins": 2.732421875, + "rewards/rejected": -2.153076171875, + "step": 683 + }, + { + "epoch": 0.1357210179076343, + "grad_norm": 43.82966812884743, + "learning_rate": 9.96546289573412e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.0, + "logps/chosen": -1082.0, + "logps/rejected": -852.0, + "loss": 0.3457, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.556640625, + "rewards/margins": 6.5234375, + "rewards/rejected": -4.95703125, + "step": 684 + }, + { + "epoch": 0.13591944044843493, + "grad_norm": 35.654323559586494, + "learning_rate": 
9.965076429700773e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.34375, + "logps/chosen": -1188.0, + "logps/rejected": -900.0, + "loss": 0.4407, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.755859375, + "rewards/margins": 4.8984375, + "rewards/rejected": -3.150390625, + "step": 685 + }, + { + "epoch": 0.1361178629892356, + "grad_norm": 36.55531894852696, + "learning_rate": 9.964687821849576e-07, + "logits/chosen": 3.41015625, + "logits/rejected": 3.796875, + "logps/chosen": -1066.0, + "logps/rejected": -634.0, + "loss": 0.497, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.932403564453125, + "rewards/margins": 4.1171875, + "rewards/rejected": -3.1875, + "step": 686 + }, + { + "epoch": 0.13631628553003622, + "grad_norm": 42.234318735686315, + "learning_rate": 9.96429707236694e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.921875, + "logps/chosen": -898.0, + "logps/rejected": -574.0, + "loss": 0.5521, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.93408203125, + "rewards/margins": 3.89453125, + "rewards/rejected": -2.96484375, + "step": 687 + }, + { + "epoch": 0.13651470807083685, + "grad_norm": 42.16838771112956, + "learning_rate": 9.963904181440298e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 4.20703125, + "logps/chosen": -1221.0, + "logps/rejected": -681.0, + "loss": 0.3927, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.787109375, + "rewards/margins": 5.20703125, + "rewards/rejected": -3.421875, + "step": 688 + }, + { + "epoch": 0.13671313061163748, + "grad_norm": 39.03056457831899, + "learning_rate": 9.963509149258114e-07, + "logits/chosen": 3.67578125, + "logits/rejected": 3.71875, + "logps/chosen": -827.0, + "logps/rejected": -1154.0, + "loss": 0.4156, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.451171875, + "rewards/margins": 5.7890625, + "rewards/rejected": -4.33984375, + "step": 689 + }, + { + "epoch": 0.1369115531524381, + "grad_norm": 45.30039184558278, + "learning_rate": 
9.963111976009876e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.75390625, + "logps/chosen": -741.5, + "logps/rejected": -814.0, + "loss": 0.5088, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.45654296875, + "rewards/margins": 9.6484375, + "rewards/rejected": -9.1875, + "step": 690 + }, + { + "epoch": 0.13710997569323874, + "grad_norm": 39.12585212549323, + "learning_rate": 9.962712661886101e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.2109375, + "logps/chosen": -940.0, + "logps/rejected": -623.0, + "loss": 0.6016, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.541015625, + "rewards/margins": 3.73046875, + "rewards/rejected": -3.19140625, + "step": 691 + }, + { + "epoch": 0.1373083982340394, + "grad_norm": 42.38833643176591, + "learning_rate": 9.96231120707834e-07, + "logits/chosen": 4.15625, + "logits/rejected": 3.9609375, + "logps/chosen": -1185.0, + "logps/rejected": -814.5, + "loss": 0.5196, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.73046875, + "rewards/margins": 4.140625, + "rewards/rejected": -3.4140625, + "step": 692 + }, + { + "epoch": 0.13750682077484003, + "grad_norm": 36.92122873242455, + "learning_rate": 9.961907611779154e-07, + "logits/chosen": 3.37109375, + "logits/rejected": 3.5390625, + "logps/chosen": -1154.0, + "logps/rejected": -923.0, + "loss": 0.3999, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.6318359375, + "rewards/margins": 5.33203125, + "rewards/rejected": -3.69921875, + "step": 693 + }, + { + "epoch": 0.13770524331564066, + "grad_norm": 44.38032148704965, + "learning_rate": 9.961501876182148e-07, + "logits/chosen": 3.63671875, + "logits/rejected": 3.51171875, + "logps/chosen": -980.0, + "logps/rejected": -573.0, + "loss": 0.5487, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.716796875, + "rewards/margins": 4.09375, + "rewards/rejected": -2.375, + "step": 694 + }, + { + "epoch": 0.1379036658564413, + "grad_norm": 35.076601723285386, + "learning_rate": 
9.96109400048194e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.97265625, + "logps/chosen": -782.5, + "logps/rejected": -736.0, + "loss": 0.5578, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.39111328125, + "rewards/margins": 4.515625, + "rewards/rejected": -3.125, + "step": 695 + }, + { + "epoch": 0.13810208839724192, + "grad_norm": 43.83987172452935, + "learning_rate": 9.960683984874183e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 4.0703125, + "logps/chosen": -1129.0, + "logps/rejected": -683.5, + "loss": 0.4945, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.419921875, + "rewards/margins": 4.53515625, + "rewards/rejected": -3.111328125, + "step": 696 + }, + { + "epoch": 0.13830051093804255, + "grad_norm": 40.03847998178344, + "learning_rate": 9.960271829555557e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.8671875, + "logps/chosen": -870.0, + "logps/rejected": -696.0, + "loss": 0.494, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.4013671875, + "rewards/margins": 4.6796875, + "rewards/rejected": -3.28515625, + "step": 697 + }, + { + "epoch": 0.1384989334788432, + "grad_norm": 49.58263518422443, + "learning_rate": 9.959857534723763e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.18359375, + "logps/chosen": -1225.0, + "logps/rejected": -905.0, + "loss": 0.4003, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.75, + "rewards/margins": 6.1875, + "rewards/rejected": -4.4453125, + "step": 698 + }, + { + "epoch": 0.13869735601964384, + "grad_norm": 37.6339276493109, + "learning_rate": 9.95944110057753e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.7265625, + "logps/chosen": -1037.0, + "logps/rejected": -697.0, + "loss": 0.5487, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.5537109375, + "rewards/margins": 3.666015625, + "rewards/rejected": -2.119140625, + "step": 699 + }, + { + "epoch": 0.13889577856044447, + "grad_norm": 38.885187436770956, + "learning_rate": 9.959022527316617e-07, 
+ "logits/chosen": 3.9921875, + "logits/rejected": 4.3203125, + "logps/chosen": -756.0, + "logps/rejected": -612.0, + "loss": 0.5998, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.0201416015625, + "rewards/margins": 3.736328125, + "rewards/rejected": -2.7158203125, + "step": 700 + }, + { + "epoch": 0.1390942011012451, + "grad_norm": 48.02202607195783, + "learning_rate": 9.958601815141803e-07, + "logits/chosen": 4.46875, + "logits/rejected": 4.30859375, + "logps/chosen": -966.0, + "logps/rejected": -640.5, + "loss": 0.6163, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.72705078125, + "rewards/margins": 3.474609375, + "rewards/rejected": -2.75, + "step": 701 + }, + { + "epoch": 0.13929262364204573, + "grad_norm": 38.410171517926244, + "learning_rate": 9.958178964254898e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.375, + "logps/chosen": -840.0, + "logps/rejected": -1342.0, + "loss": 0.5751, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.537109375, + "rewards/margins": 4.591796875, + "rewards/rejected": -3.05517578125, + "step": 702 + }, + { + "epoch": 0.13949104618284638, + "grad_norm": 38.02523485691479, + "learning_rate": 9.957753974858736e-07, + "logits/chosen": 4.4921875, + "logits/rejected": 4.625, + "logps/chosen": -946.0, + "logps/rejected": -829.0, + "loss": 0.5265, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.7265625, + "rewards/margins": 4.61328125, + "rewards/rejected": -2.8828125, + "step": 703 + }, + { + "epoch": 0.13968946872364701, + "grad_norm": 40.519300920659425, + "learning_rate": 9.957326847157177e-07, + "logits/chosen": 3.7421875, + "logits/rejected": 3.70703125, + "logps/chosen": -702.0, + "logps/rejected": -546.0, + "loss": 0.4658, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.708984375, + "rewards/margins": 3.8515625, + "rewards/rejected": -2.130859375, + "step": 704 + }, + { + "epoch": 0.13988789126444764, + "grad_norm": 38.791353145309884, + "learning_rate": 9.956897581355106e-07, + 
"logits/chosen": 4.125, + "logits/rejected": 4.20703125, + "logps/chosen": -1092.0, + "logps/rejected": -739.0, + "loss": 0.2869, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2041015625, + "rewards/margins": 6.15234375, + "rewards/rejected": -3.9375, + "step": 705 + }, + { + "epoch": 0.14008631380524827, + "grad_norm": 39.20932478830196, + "learning_rate": 9.956466177658434e-07, + "logits/chosen": 3.66015625, + "logits/rejected": 3.76953125, + "logps/chosen": -1205.0, + "logps/rejected": -847.5, + "loss": 0.3954, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.5048828125, + "rewards/margins": 5.3203125, + "rewards/rejected": -3.80859375, + "step": 706 + }, + { + "epoch": 0.1402847363460489, + "grad_norm": 53.48474950391239, + "learning_rate": 9.9560326362741e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.31640625, + "logps/chosen": -992.0, + "logps/rejected": -1045.0, + "loss": 0.5402, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.51025390625, + "rewards/margins": 4.765625, + "rewards/rejected": -4.25, + "step": 707 + }, + { + "epoch": 0.14048315888684954, + "grad_norm": 39.21900034578989, + "learning_rate": 9.95559695741006e-07, + "logits/chosen": 3.6796875, + "logits/rejected": 3.9765625, + "logps/chosen": -603.5, + "logps/rejected": -736.0, + "loss": 0.5993, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.6134033203125, + "rewards/margins": 3.5625, + "rewards/rejected": -2.94140625, + "step": 708 + }, + { + "epoch": 0.1406815814276502, + "grad_norm": 37.461645535805864, + "learning_rate": 9.955159141275307e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.32421875, + "logps/chosen": -904.0, + "logps/rejected": -544.0, + "loss": 0.3511, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.61328125, + "rewards/margins": 5.125, + "rewards/rejected": -3.50390625, + "step": 709 + }, + { + "epoch": 0.14088000396845082, + "grad_norm": 31.81864861141007, + "learning_rate": 9.954719188079854e-07, + "logits/chosen": 
4.140625, + "logits/rejected": 4.0234375, + "logps/chosen": -1245.0, + "logps/rejected": -813.0, + "loss": 0.503, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.669921875, + "rewards/margins": 5.375, + "rewards/rejected": -2.716796875, + "step": 710 + }, + { + "epoch": 0.14107842650925145, + "grad_norm": 36.99929134589748, + "learning_rate": 9.954277098034735e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 3.578125, + "logps/chosen": -950.0, + "logps/rejected": -659.5, + "loss": 0.4791, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.921875, + "rewards/margins": 4.859375, + "rewards/rejected": -3.94140625, + "step": 711 + }, + { + "epoch": 0.14127684905005208, + "grad_norm": 40.10338205674471, + "learning_rate": 9.953832871352018e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.74609375, + "logps/chosen": -1124.0, + "logps/rejected": -640.0, + "loss": 0.3388, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.529296875, + "rewards/margins": 4.953125, + "rewards/rejected": -3.41796875, + "step": 712 + }, + { + "epoch": 0.1414752715908527, + "grad_norm": 39.78817988066468, + "learning_rate": 9.953386508244784e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.1015625, + "logps/chosen": -863.0, + "logps/rejected": -663.0, + "loss": 0.5792, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7481689453125, + "rewards/margins": 4.537109375, + "rewards/rejected": -3.791015625, + "step": 713 + }, + { + "epoch": 0.14167369413165334, + "grad_norm": 42.77595075144863, + "learning_rate": 9.95293800892715e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.87890625, + "logps/chosen": -673.0, + "logps/rejected": -1421.0, + "loss": 0.5537, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.6962890625, + "rewards/margins": 6.296875, + "rewards/rejected": -5.5859375, + "step": 714 + }, + { + "epoch": 0.141872116672454, + "grad_norm": 38.14887880371411, + "learning_rate": 9.952487373614248e-07, + "logits/chosen": 3.89453125, + 
"logits/rejected": 3.87890625, + "logps/chosen": -861.0, + "logps/rejected": -657.0, + "loss": 0.49, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.26708984375, + "rewards/margins": 4.08203125, + "rewards/rejected": -2.80859375, + "step": 715 + }, + { + "epoch": 0.14207053921325463, + "grad_norm": 43.47383872652049, + "learning_rate": 9.952034602522249e-07, + "logits/chosen": 3.96875, + "logits/rejected": 4.27734375, + "logps/chosen": -777.5, + "logps/rejected": -664.5, + "loss": 0.6939, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.8035888671875, + "rewards/margins": 3.1025390625, + "rewards/rejected": -2.29931640625, + "step": 716 + }, + { + "epoch": 0.14226896175405526, + "grad_norm": 39.756794687222495, + "learning_rate": 9.951579695868332e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 3.99609375, + "logps/chosen": -991.0, + "logps/rejected": -569.0, + "loss": 0.5657, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.310546875, + "rewards/margins": 3.92578125, + "rewards/rejected": -3.609375, + "step": 717 + }, + { + "epoch": 0.1424673842948559, + "grad_norm": 39.57726691702992, + "learning_rate": 9.95112265387071e-07, + "logits/chosen": 3.796875, + "logits/rejected": 3.59765625, + "logps/chosen": -931.0, + "logps/rejected": -506.0, + "loss": 0.5002, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.931640625, + "rewards/margins": 3.71875, + "rewards/rejected": -2.783203125, + "step": 718 + }, + { + "epoch": 0.14266580683565652, + "grad_norm": 62.55561941941927, + "learning_rate": 9.950663476748614e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.3515625, + "logps/chosen": -1102.0, + "logps/rejected": -830.0, + "loss": 0.5708, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.359375, + "rewards/margins": 3.78515625, + "rewards/rejected": -2.427734375, + "step": 719 + }, + { + "epoch": 0.14286422937645715, + "grad_norm": 38.819290717862906, + "learning_rate": 9.95020216472231e-07, + "logits/chosen": 4.0078125, + 
"logits/rejected": 4.125, + "logps/chosen": -926.0, + "logps/rejected": -803.5, + "loss": 0.4526, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.80859375, + "rewards/margins": 4.63671875, + "rewards/rejected": -2.826171875, + "step": 720 + }, + { + "epoch": 0.1430626519172578, + "grad_norm": 35.78478111523616, + "learning_rate": 9.949738718013078e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.21484375, + "logps/chosen": -642.5, + "logps/rejected": -1297.5, + "loss": 0.6106, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.49072265625, + "rewards/margins": 4.359375, + "rewards/rejected": -3.859375, + "step": 721 + }, + { + "epoch": 0.14326107445805844, + "grad_norm": 48.975521622017084, + "learning_rate": 9.949273136843224e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 3.90234375, + "logps/chosen": -1030.0, + "logps/rejected": -752.0, + "loss": 0.4883, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0467529296875, + "rewards/margins": 4.3203125, + "rewards/rejected": -3.27734375, + "step": 722 + }, + { + "epoch": 0.14345949699885907, + "grad_norm": 41.76057028184975, + "learning_rate": 9.94880542143608e-07, + "logits/chosen": 3.890625, + "logits/rejected": 4.0546875, + "logps/chosen": -933.0, + "logps/rejected": -642.0, + "loss": 0.524, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8115234375, + "rewards/margins": 4.2890625, + "rewards/rejected": -2.482421875, + "step": 723 + }, + { + "epoch": 0.1436579195396597, + "grad_norm": 45.01790981020067, + "learning_rate": 9.948335572016e-07, + "logits/chosen": 3.6953125, + "logits/rejected": 3.625, + "logps/chosen": -1011.5, + "logps/rejected": -750.0, + "loss": 0.5527, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.081787109375, + "rewards/margins": 3.818359375, + "rewards/rejected": -2.732421875, + "step": 724 + }, + { + "epoch": 0.14385634208046033, + "grad_norm": 33.23930742363142, + "learning_rate": 9.947863588808364e-07, + "logits/chosen": 4.5, + "logits/rejected": 
4.5625, + "logps/chosen": -737.0, + "logps/rejected": -810.5, + "loss": 0.3748, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.978515625, + "rewards/margins": 5.7421875, + "rewards/rejected": -3.771484375, + "step": 725 + }, + { + "epoch": 0.144054764621261, + "grad_norm": 32.104480566260875, + "learning_rate": 9.947389472039574e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.06640625, + "logps/chosen": -1012.0, + "logps/rejected": -716.0, + "loss": 0.488, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.28125, + "rewards/margins": 5.484375, + "rewards/rejected": -3.19921875, + "step": 726 + }, + { + "epoch": 0.14425318716206162, + "grad_norm": 57.586517104169765, + "learning_rate": 9.946913221937053e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 3.7421875, + "logps/chosen": -1168.0, + "logps/rejected": -617.0, + "loss": 0.5638, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.40234375, + "rewards/margins": 3.92578125, + "rewards/rejected": -2.525390625, + "step": 727 + }, + { + "epoch": 0.14445160970286225, + "grad_norm": 40.210804636905245, + "learning_rate": 9.94643483872925e-07, + "logits/chosen": 3.66015625, + "logits/rejected": 3.5703125, + "logps/chosen": -988.0, + "logps/rejected": -693.0, + "loss": 0.5485, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.505859375, + "rewards/margins": 4.50390625, + "rewards/rejected": -2.994140625, + "step": 728 + }, + { + "epoch": 0.14465003224366288, + "grad_norm": 65.22996233480477, + "learning_rate": 9.94595432264564e-07, + "logits/chosen": 3.5703125, + "logits/rejected": 3.7265625, + "logps/chosen": -1427.0, + "logps/rejected": -1477.0, + "loss": 0.5378, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.9658203125, + "rewards/margins": 6.0703125, + "rewards/rejected": -5.11328125, + "step": 729 + }, + { + "epoch": 0.1448484547844635, + "grad_norm": 38.94174385799742, + "learning_rate": 9.945471673916715e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 
3.97265625, + "logps/chosen": -955.0, + "logps/rejected": -527.0, + "loss": 0.5405, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.1259765625, + "rewards/margins": 3.875, + "rewards/rejected": -2.7421875, + "step": 730 + }, + { + "epoch": 0.14504687732526414, + "grad_norm": 40.67207086593549, + "learning_rate": 9.944986892773995e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.96875, + "logps/chosen": -1020.0, + "logps/rejected": -600.0, + "loss": 0.5972, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.9052734375, + "rewards/margins": 3.55859375, + "rewards/rejected": -2.66015625, + "step": 731 + }, + { + "epoch": 0.1452452998660648, + "grad_norm": 41.956432324828754, + "learning_rate": 9.94449997945002e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.9296875, + "logps/chosen": -1226.0, + "logps/rejected": -1018.0, + "loss": 0.4188, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.59375, + "rewards/margins": 6.2421875, + "rewards/rejected": -4.65625, + "step": 732 + }, + { + "epoch": 0.14544372240686543, + "grad_norm": 40.19335017916601, + "learning_rate": 9.944010934178352e-07, + "logits/chosen": 4.52734375, + "logits/rejected": 4.40234375, + "logps/chosen": -1091.0, + "logps/rejected": -726.5, + "loss": 0.5058, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.310546875, + "rewards/margins": 4.3515625, + "rewards/rejected": -2.046875, + "step": 733 + }, + { + "epoch": 0.14564214494766606, + "grad_norm": 38.20413046703415, + "learning_rate": 9.943519757193583e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.76171875, + "logps/chosen": -1011.0, + "logps/rejected": -655.0, + "loss": 0.5782, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.595703125, + "rewards/margins": 3.37109375, + "rewards/rejected": -1.7763671875, + "step": 734 + }, + { + "epoch": 0.1458405674884667, + "grad_norm": 46.70953797287438, + "learning_rate": 9.943026448731315e-07, + "logits/chosen": 3.8046875, + "logits/rejected": 3.76171875, + 
"logps/chosen": -1273.0, + "logps/rejected": -683.0, + "loss": 0.4034, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.353515625, + "rewards/margins": 5.3671875, + "rewards/rejected": -3.01953125, + "step": 735 + }, + { + "epoch": 0.14603899002926732, + "grad_norm": 43.628707256016384, + "learning_rate": 9.942531009028182e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.59375, + "logps/chosen": -1062.0, + "logps/rejected": -1476.0, + "loss": 0.4705, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.521484375, + "rewards/margins": 6.0625, + "rewards/rejected": -4.55078125, + "step": 736 + }, + { + "epoch": 0.14623741257006795, + "grad_norm": 40.720245129865404, + "learning_rate": 9.94203343832184e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.99609375, + "logps/chosen": -816.0, + "logps/rejected": -768.0, + "loss": 0.4654, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.1708984375, + "rewards/margins": 4.4296875, + "rewards/rejected": -3.265625, + "step": 737 + }, + { + "epoch": 0.1464358351108686, + "grad_norm": 45.54962189345215, + "learning_rate": 9.94153373685096e-07, + "logits/chosen": 4.40234375, + "logits/rejected": 4.625, + "logps/chosen": -866.0, + "logps/rejected": -683.75, + "loss": 0.5153, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.033935546875, + "rewards/margins": 4.96337890625, + "rewards/rejected": -2.921875, + "step": 738 + }, + { + "epoch": 0.14663425765166924, + "grad_norm": 36.394816627232494, + "learning_rate": 9.941031904855244e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.5625, + "logps/chosen": -902.0, + "logps/rejected": -1466.0, + "loss": 0.4237, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.83203125, + "rewards/margins": 6.7734375, + "rewards/rejected": -4.93359375, + "step": 739 + }, + { + "epoch": 0.14683268019246987, + "grad_norm": 43.90026277675387, + "learning_rate": 9.94052794257541e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.47265625, + 
"logps/chosen": -1064.0, + "logps/rejected": -1063.0, + "loss": 0.5633, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.416015625, + "rewards/margins": 5.1015625, + "rewards/rejected": -3.6875, + "step": 740 + }, + { + "epoch": 0.1470311027332705, + "grad_norm": 36.967356149094584, + "learning_rate": 9.940021850253203e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.05078125, + "logps/chosen": -1014.0, + "logps/rejected": -812.0, + "loss": 0.4882, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.4521484375, + "rewards/margins": 4.5, + "rewards/rejected": -3.04296875, + "step": 741 + }, + { + "epoch": 0.14722952527407113, + "grad_norm": 36.63247847021023, + "learning_rate": 9.939513628131382e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.859375, + "logps/chosen": -637.0, + "logps/rejected": -1184.5, + "loss": 0.5899, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.98046875, + "rewards/margins": 3.9453125, + "rewards/rejected": -2.96484375, + "step": 742 + }, + { + "epoch": 0.14742794781487176, + "grad_norm": 41.088252039442274, + "learning_rate": 9.939003276453732e-07, + "logits/chosen": 4.35546875, + "logits/rejected": 4.3046875, + "logps/chosen": -1250.0, + "logps/rejected": -962.0, + "loss": 0.3639, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.98046875, + "rewards/margins": 6.7109375, + "rewards/rejected": -4.73046875, + "step": 743 + }, + { + "epoch": 0.14762637035567241, + "grad_norm": 39.40803976382112, + "learning_rate": 9.938490795465063e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.453125, + "logps/chosen": -1012.0, + "logps/rejected": -971.0, + "loss": 0.5002, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.7626953125, + "rewards/margins": 5.7265625, + "rewards/rejected": -3.96484375, + "step": 744 + }, + { + "epoch": 0.14782479289647305, + "grad_norm": 50.20722423702584, + "learning_rate": 9.937976185411202e-07, + "logits/chosen": 3.76953125, + "logits/rejected": 3.98046875, + "logps/chosen": 
-1068.0, + "logps/rejected": -755.0, + "loss": 0.6008, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.0078125, + "rewards/margins": 4.03515625, + "rewards/rejected": -3.0234375, + "step": 745 + }, + { + "epoch": 0.14802321543727368, + "grad_norm": 33.455374986584, + "learning_rate": 9.937459446538996e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 3.97265625, + "logps/chosen": -907.0, + "logps/rejected": -518.0, + "loss": 0.5195, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.06396484375, + "rewards/margins": 3.48046875, + "rewards/rejected": -2.41796875, + "step": 746 + }, + { + "epoch": 0.1482216379780743, + "grad_norm": 43.585584829183574, + "learning_rate": 9.93694057909632e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 4.01953125, + "logps/chosen": -888.0, + "logps/rejected": -597.0, + "loss": 0.5304, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.0947265625, + "rewards/margins": 4.6171875, + "rewards/rejected": -3.51953125, + "step": 747 + }, + { + "epoch": 0.14842006051887494, + "grad_norm": 39.538889015284546, + "learning_rate": 9.93641958333206e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.21484375, + "logps/chosen": -773.0, + "logps/rejected": -471.0, + "loss": 0.5422, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.91131591796875, + "rewards/margins": 3.76953125, + "rewards/rejected": -2.86328125, + "step": 748 + }, + { + "epoch": 0.14861848305967557, + "grad_norm": 43.38532261749125, + "learning_rate": 9.935896459496134e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.83984375, + "logps/chosen": -935.0, + "logps/rejected": -680.5, + "loss": 0.4751, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.904296875, + "rewards/margins": 4.59375, + "rewards/rejected": -2.685546875, + "step": 749 + }, + { + "epoch": 0.14881690560047622, + "grad_norm": 44.67280434283344, + "learning_rate": 9.93537120783947e-07, + "logits/chosen": 3.58203125, + "logits/rejected": 3.71484375, + 
"logps/chosen": -1075.0, + "logps/rejected": -851.0, + "loss": 0.5279, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.259765625, + "rewards/margins": 3.9677734375, + "rewards/rejected": -2.70849609375, + "step": 750 + }, + { + "epoch": 0.14901532814127685, + "grad_norm": 43.134015215490656, + "learning_rate": 9.934843828614026e-07, + "logits/chosen": 3.6484375, + "logits/rejected": 3.609375, + "logps/chosen": -1303.0, + "logps/rejected": -871.0, + "loss": 0.4897, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.9615478515625, + "rewards/margins": 4.95703125, + "rewards/rejected": -4.0, + "step": 751 + }, + { + "epoch": 0.14921375068207748, + "grad_norm": 44.239609699812604, + "learning_rate": 9.934314322072774e-07, + "logits/chosen": 3.6953125, + "logits/rejected": 3.6953125, + "logps/chosen": -954.0, + "logps/rejected": -693.0, + "loss": 0.5112, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.3134765625, + "rewards/margins": 4.3828125, + "rewards/rejected": -3.06640625, + "step": 752 + }, + { + "epoch": 0.14941217322287811, + "grad_norm": 42.13238655624287, + "learning_rate": 9.933782688469711e-07, + "logits/chosen": 3.74609375, + "logits/rejected": 4.1484375, + "logps/chosen": -957.0, + "logps/rejected": -729.0, + "loss": 0.4989, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.950439453125, + "rewards/margins": 4.859375, + "rewards/rejected": -3.91015625, + "step": 753 + }, + { + "epoch": 0.14961059576367874, + "grad_norm": 41.265581801173305, + "learning_rate": 9.93324892805985e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.359375, + "logps/chosen": -1099.0, + "logps/rejected": -865.0, + "loss": 0.3542, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6904296875, + "rewards/margins": 6.0703125, + "rewards/rejected": -4.375, + "step": 754 + }, + { + "epoch": 0.1498090183044794, + "grad_norm": 44.64809677907114, + "learning_rate": 9.932713041099226e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.31640625, + 
"logps/chosen": -1378.0, + "logps/rejected": -734.0, + "loss": 0.5468, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.686767578125, + "rewards/margins": 4.73046875, + "rewards/rejected": -3.044921875, + "step": 755 + }, + { + "epoch": 0.15000744084528003, + "grad_norm": 45.37686397289485, + "learning_rate": 9.932175027844896e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.9296875, + "logps/chosen": -1243.0, + "logps/rejected": -1193.0, + "loss": 0.5129, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.2646484375, + "rewards/margins": 5.390625, + "rewards/rejected": -4.130859375, + "step": 756 + }, + { + "epoch": 0.15020586338608066, + "grad_norm": 32.21421060997988, + "learning_rate": 9.931634888554935e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.45703125, + "logps/chosen": -894.0, + "logps/rejected": -770.0, + "loss": 0.5304, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.453125, + "rewards/margins": 4.67578125, + "rewards/rejected": -3.228515625, + "step": 757 + }, + { + "epoch": 0.1504042859268813, + "grad_norm": 35.32869384561756, + "learning_rate": 9.931092623488442e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.86328125, + "logps/chosen": -1108.0, + "logps/rejected": -710.0, + "loss": 0.4623, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.623046875, + "rewards/margins": 5.25, + "rewards/rejected": -3.62890625, + "step": 758 + }, + { + "epoch": 0.15060270846768192, + "grad_norm": 34.14133466434811, + "learning_rate": 9.930548232905524e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.6171875, + "logps/chosen": -927.0, + "logps/rejected": -617.0, + "loss": 0.4615, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.078125, + "rewards/margins": 4.58203125, + "rewards/rejected": -2.5009765625, + "step": 759 + }, + { + "epoch": 0.15080113100848255, + "grad_norm": 40.320846220531536, + "learning_rate": 9.93000171706732e-07, + "logits/chosen": 3.51953125, + "logits/rejected": 3.72265625, + 
"logps/chosen": -884.0, + "logps/rejected": -853.0, + "loss": 0.4751, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.361328125, + "rewards/margins": 4.4375, + "rewards/rejected": -3.078125, + "step": 760 + }, + { + "epoch": 0.1509995535492832, + "grad_norm": 45.08932920521483, + "learning_rate": 9.929453076235986e-07, + "logits/chosen": 3.70703125, + "logits/rejected": 3.6953125, + "logps/chosen": -667.0, + "logps/rejected": -803.0, + "loss": 0.6292, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.661376953125, + "rewards/margins": 3.251953125, + "rewards/rejected": -2.59375, + "step": 761 + }, + { + "epoch": 0.15119797609008384, + "grad_norm": 45.76400800669342, + "learning_rate": 9.92890231067469e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.16015625, + "logps/chosen": -1360.0, + "logps/rejected": -793.0, + "loss": 0.3804, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3046875, + "rewards/margins": 5.609375, + "rewards/rejected": -3.3125, + "step": 762 + }, + { + "epoch": 0.15139639863088447, + "grad_norm": 36.28395124053769, + "learning_rate": 9.928349420647628e-07, + "logits/chosen": 3.9375, + "logits/rejected": 4.09375, + "logps/chosen": -880.0, + "logps/rejected": -1245.0, + "loss": 0.4719, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.32080078125, + "rewards/margins": 6.1640625, + "rewards/rejected": -4.8359375, + "step": 763 + }, + { + "epoch": 0.1515948211716851, + "grad_norm": 40.339734135866365, + "learning_rate": 9.927794406420012e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.2890625, + "logps/chosen": -1071.0, + "logps/rejected": -1454.0, + "loss": 0.4657, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4150390625, + "rewards/margins": 6.37109375, + "rewards/rejected": -4.95703125, + "step": 764 + }, + { + "epoch": 0.15179324371248573, + "grad_norm": 38.60688701118083, + "learning_rate": 9.927237268258073e-07, + "logits/chosen": 4.359375, + "logits/rejected": 4.35546875, + "logps/chosen": -751.0, + 
"logps/rejected": -794.0, + "loss": 0.539, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.20361328125, + "rewards/margins": 4.078125, + "rewards/rejected": -2.869140625, + "step": 765 + }, + { + "epoch": 0.15199166625328636, + "grad_norm": 35.14725434967038, + "learning_rate": 9.926678006429055e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.2421875, + "logps/chosen": -1150.0, + "logps/rejected": -625.5, + "loss": 0.4601, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.5703125, + "rewards/margins": 4.515625, + "rewards/rejected": -2.94921875, + "step": 766 + }, + { + "epoch": 0.15219008879408702, + "grad_norm": 35.855606742463145, + "learning_rate": 9.926116621201232e-07, + "logits/chosen": 4.390625, + "logits/rejected": 4.32421875, + "logps/chosen": -860.0, + "logps/rejected": -586.5, + "loss": 0.4749, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.767578125, + "rewards/margins": 4.2421875, + "rewards/rejected": -2.470703125, + "step": 767 + }, + { + "epoch": 0.15238851133488765, + "grad_norm": 40.82499393853144, + "learning_rate": 9.925553112843888e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.0390625, + "logps/chosen": -1301.0, + "logps/rejected": -722.5, + "loss": 0.3503, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.765625, + "rewards/margins": 5.89453125, + "rewards/rejected": -4.1328125, + "step": 768 + }, + { + "epoch": 0.15258693387568828, + "grad_norm": 37.18090364601392, + "learning_rate": 9.924987481627327e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.3046875, + "logps/chosen": -831.0, + "logps/rejected": -586.0, + "loss": 0.5537, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.482421875, + "rewards/margins": 3.9140625, + "rewards/rejected": -2.4375, + "step": 769 + }, + { + "epoch": 0.1527853564164889, + "grad_norm": 48.804472176758644, + "learning_rate": 9.924419727822875e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.39453125, + "logps/chosen": -822.0, + "logps/rejected": 
-511.0, + "loss": 0.5266, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.923828125, + "rewards/margins": 4.30078125, + "rewards/rejected": -3.37890625, + "step": 770 + }, + { + "epoch": 0.15298377895728954, + "grad_norm": 37.991709238708424, + "learning_rate": 9.92384985170287e-07, + "logits/chosen": 3.69921875, + "logits/rejected": 3.59375, + "logps/chosen": -942.5, + "logps/rejected": -548.0, + "loss": 0.5803, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.2646484375, + "rewards/margins": 3.7578125, + "rewards/rejected": -2.490234375, + "step": 771 + }, + { + "epoch": 0.15318220149809017, + "grad_norm": 35.25000981535877, + "learning_rate": 9.923277853540673e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.96875, + "logps/chosen": -737.5, + "logps/rejected": -510.5, + "loss": 0.4502, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5390625, + "rewards/margins": 4.29296875, + "rewards/rejected": -2.76171875, + "step": 772 + }, + { + "epoch": 0.15338062403889083, + "grad_norm": 44.53728692937205, + "learning_rate": 9.922703733610664e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 3.91015625, + "logps/chosen": -641.5, + "logps/rejected": -704.0, + "loss": 0.596, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6708984375, + "rewards/margins": 4.6484375, + "rewards/rejected": -3.97265625, + "step": 773 + }, + { + "epoch": 0.15357904657969146, + "grad_norm": 43.21677375487113, + "learning_rate": 9.922127492188233e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.01953125, + "logps/chosen": -768.5, + "logps/rejected": -664.0, + "loss": 0.4356, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.4404296875, + "rewards/margins": 5.1171875, + "rewards/rejected": -3.6640625, + "step": 774 + }, + { + "epoch": 0.1537774691204921, + "grad_norm": 47.18306451454107, + "learning_rate": 9.921549129549798e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.02734375, + "logps/chosen": -1442.0, + "logps/rejected": -896.0, + 
"loss": 0.5013, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.5439453125, + "rewards/margins": 5.1171875, + "rewards/rejected": -3.5859375, + "step": 775 + }, + { + "epoch": 0.15397589166129272, + "grad_norm": 36.725318812075926, + "learning_rate": 9.920968645972784e-07, + "logits/chosen": 3.69140625, + "logits/rejected": 4.3359375, + "logps/chosen": -667.0, + "logps/rejected": -847.0, + "loss": 0.6927, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.09912109375, + "rewards/margins": 3.21484375, + "rewards/rejected": -2.11328125, + "step": 776 + }, + { + "epoch": 0.15417431420209335, + "grad_norm": 34.45396110912476, + "learning_rate": 9.920386041735643e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 4.08203125, + "logps/chosen": -733.0, + "logps/rejected": -783.0, + "loss": 0.5328, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.0576171875, + "rewards/margins": 4.42578125, + "rewards/rejected": -3.3671875, + "step": 777 + }, + { + "epoch": 0.15437273674289398, + "grad_norm": 50.781078817375835, + "learning_rate": 9.919801317117838e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 4.30078125, + "logps/chosen": -1112.0, + "logps/rejected": -1515.0, + "loss": 0.6618, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.7587890625, + "rewards/margins": 4.7109375, + "rewards/rejected": -3.95703125, + "step": 778 + }, + { + "epoch": 0.15457115928369464, + "grad_norm": 40.26300749339485, + "learning_rate": 9.919214472399851e-07, + "logits/chosen": 4.37109375, + "logits/rejected": 4.55078125, + "logps/chosen": -1055.0, + "logps/rejected": -1727.0, + "loss": 0.4366, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.7841796875, + "rewards/margins": 6.765625, + "rewards/rejected": -4.98828125, + "step": 779 + }, + { + "epoch": 0.15476958182449527, + "grad_norm": 40.622572299225105, + "learning_rate": 9.918625507863181e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.84765625, + "logps/chosen": -1213.0, + "logps/rejected": 
-813.0, + "loss": 0.4388, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.03515625, + "rewards/margins": 5.90625, + "rewards/rejected": -3.8671875, + "step": 780 + }, + { + "epoch": 0.1549680043652959, + "grad_norm": 36.1321511750219, + "learning_rate": 9.918034423790345e-07, + "logits/chosen": 4.65625, + "logits/rejected": 4.5078125, + "logps/chosen": -985.0, + "logps/rejected": -656.5, + "loss": 0.4344, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.7080078125, + "rewards/margins": 5.61328125, + "rewards/rejected": -3.8984375, + "step": 781 + }, + { + "epoch": 0.15516642690609653, + "grad_norm": 41.537062988415556, + "learning_rate": 9.917441220464874e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.18359375, + "logps/chosen": -792.0, + "logps/rejected": -1629.0, + "loss": 0.5425, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.41357421875, + "rewards/margins": 6.87890625, + "rewards/rejected": -5.45703125, + "step": 782 + }, + { + "epoch": 0.15536484944689716, + "grad_norm": 42.314929388304705, + "learning_rate": 9.91684589817132e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.1484375, + "logps/chosen": -1125.0, + "logps/rejected": -1302.0, + "loss": 0.5181, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5380859375, + "rewards/margins": 5.83984375, + "rewards/rejected": -4.296875, + "step": 783 + }, + { + "epoch": 0.15556327198769782, + "grad_norm": 41.275217195639165, + "learning_rate": 9.916248457195244e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.0390625, + "logps/chosen": -822.0, + "logps/rejected": -668.5, + "loss": 0.5873, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.984375, + "rewards/margins": 3.6015625, + "rewards/rejected": -2.615234375, + "step": 784 + }, + { + "epoch": 0.15576169452849845, + "grad_norm": 39.44238506244204, + "learning_rate": 9.915648897823232e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 4.05078125, + "logps/chosen": -970.0, + "logps/rejected": -709.5, + 
"loss": 0.4152, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2552490234375, + "rewards/margins": 5.11328125, + "rewards/rejected": -3.86328125, + "step": 785 + }, + { + "epoch": 0.15596011706929908, + "grad_norm": 32.93417811257873, + "learning_rate": 9.915047220342878e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.11328125, + "logps/chosen": -953.0, + "logps/rejected": -487.5, + "loss": 0.4257, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.3818359375, + "rewards/margins": 4.6640625, + "rewards/rejected": -3.271484375, + "step": 786 + }, + { + "epoch": 0.1561585396100997, + "grad_norm": 44.80550663944957, + "learning_rate": 9.914443425042799e-07, + "logits/chosen": 3.388671875, + "logits/rejected": 3.3671875, + "logps/chosen": -651.0, + "logps/rejected": -533.0, + "loss": 0.5601, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9091796875, + "rewards/margins": 3.6015625, + "rewards/rejected": -2.693359375, + "step": 787 + }, + { + "epoch": 0.15635696215090034, + "grad_norm": 37.67963546055921, + "learning_rate": 9.913837512212623e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.9375, + "logps/chosen": -1068.0, + "logps/rejected": -758.0, + "loss": 0.4443, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.82421875, + "rewards/margins": 4.90234375, + "rewards/rejected": -3.0859375, + "step": 788 + }, + { + "epoch": 0.15655538469170097, + "grad_norm": 39.66677936054083, + "learning_rate": 9.913229482142997e-07, + "logits/chosen": 3.546875, + "logits/rejected": 3.61328125, + "logps/chosen": -1080.0, + "logps/rejected": -652.0, + "loss": 0.6173, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.81982421875, + "rewards/margins": 3.830078125, + "rewards/rejected": -3.01171875, + "step": 789 + }, + { + "epoch": 0.15675380723250162, + "grad_norm": 31.21391055138516, + "learning_rate": 9.912619335125583e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.86328125, + "logps/chosen": -564.5, + "logps/rejected": -1623.0, + 
"loss": 0.5012, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.0048828125, + "rewards/margins": 6.5078125, + "rewards/rejected": -5.50390625, + "step": 790 + }, + { + "epoch": 0.15695222977330225, + "grad_norm": 37.12970149167072, + "learning_rate": 9.912007071453056e-07, + "logits/chosen": 3.63671875, + "logits/rejected": 4.0078125, + "logps/chosen": -1186.0, + "logps/rejected": -909.0, + "loss": 0.4412, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.9482421875, + "rewards/margins": 6.05078125, + "rewards/rejected": -4.10546875, + "step": 791 + }, + { + "epoch": 0.15715065231410288, + "grad_norm": 37.68405078538573, + "learning_rate": 9.911392691419108e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.046875, + "logps/chosen": -843.0, + "logps/rejected": -626.0, + "loss": 0.5685, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.052734375, + "rewards/margins": 4.068359375, + "rewards/rejected": -3.015625, + "step": 792 + }, + { + "epoch": 0.15734907485490351, + "grad_norm": 34.43522829983623, + "learning_rate": 9.910776195318447e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 3.765625, + "logps/chosen": -918.5, + "logps/rejected": -632.5, + "loss": 0.4945, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8203125, + "rewards/margins": 5.046875, + "rewards/rejected": -3.234375, + "step": 793 + }, + { + "epoch": 0.15754749739570414, + "grad_norm": 41.939580415716165, + "learning_rate": 9.910157583446796e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 3.96875, + "logps/chosen": -1186.0, + "logps/rejected": -809.0, + "loss": 0.5015, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5498046875, + "rewards/margins": 5.69921875, + "rewards/rejected": -5.1484375, + "step": 794 + }, + { + "epoch": 0.15774591993650477, + "grad_norm": 38.94132784310226, + "learning_rate": 9.90953685610089e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 4.3046875, + "logps/chosen": -1178.0, + "logps/rejected": -907.0, + "loss": 
0.4287, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.15625, + "rewards/margins": 6.2890625, + "rewards/rejected": -4.130859375, + "step": 795 + }, + { + "epoch": 0.15794434247730543, + "grad_norm": 33.972211167338834, + "learning_rate": 9.90891401357848e-07, + "logits/chosen": 3.515625, + "logits/rejected": 3.83984375, + "logps/chosen": -1111.0, + "logps/rejected": -785.5, + "loss": 0.5002, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.7421875, + "rewards/margins": 6.0, + "rewards/rejected": -4.26171875, + "step": 796 + }, + { + "epoch": 0.15814276501810606, + "grad_norm": 40.310736192614286, + "learning_rate": 9.908289056178337e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.359375, + "logps/chosen": -1051.0, + "logps/rejected": -882.0, + "loss": 0.4343, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.51416015625, + "rewards/margins": 6.65625, + "rewards/rejected": -5.125, + "step": 797 + }, + { + "epoch": 0.1583411875589067, + "grad_norm": 44.96816739434785, + "learning_rate": 9.907661984200238e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 4.02734375, + "logps/chosen": -957.0, + "logps/rejected": -740.0, + "loss": 0.414, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.8056640625, + "rewards/margins": 5.7265625, + "rewards/rejected": -3.91796875, + "step": 798 + }, + { + "epoch": 0.15853961009970732, + "grad_norm": 44.11143848589651, + "learning_rate": 9.90703279794498e-07, + "logits/chosen": 4.15625, + "logits/rejected": 3.84375, + "logps/chosen": -1141.0, + "logps/rejected": -943.0, + "loss": 0.4006, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.591796875, + "rewards/margins": 6.296875, + "rewards/rejected": -4.6953125, + "step": 799 + }, + { + "epoch": 0.15873803264050795, + "grad_norm": 44.165066121004905, + "learning_rate": 9.906401497714373e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 4.3671875, + "logps/chosen": -970.0, + "logps/rejected": -671.5, + "loss": 0.7688, + 
"rewards/accuracies": 0.75, + "rewards/chosen": -0.271240234375, + "rewards/margins": 2.10546875, + "rewards/rejected": -2.376953125, + "step": 800 + }, + { + "epoch": 0.15893645518130858, + "grad_norm": 45.986132873480656, + "learning_rate": 9.90576808381124e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 4.015625, + "logps/chosen": -690.0, + "logps/rejected": -441.0, + "loss": 0.5369, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.30859375, + "rewards/margins": 3.9296875, + "rewards/rejected": -2.619140625, + "step": 801 + }, + { + "epoch": 0.15913487772210924, + "grad_norm": 33.04130356190822, + "learning_rate": 9.905132556539418e-07, + "logits/chosen": 3.84375, + "logits/rejected": 3.98828125, + "logps/chosen": -943.0, + "logps/rejected": -589.5, + "loss": 0.5341, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.1646728515625, + "rewards/margins": 4.56640625, + "rewards/rejected": -3.408203125, + "step": 802 + }, + { + "epoch": 0.15933330026290987, + "grad_norm": 42.92772990281708, + "learning_rate": 9.904494916203758e-07, + "logits/chosen": 4.42578125, + "logits/rejected": 4.8671875, + "logps/chosen": -1024.0, + "logps/rejected": -1042.0, + "loss": 0.4673, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8515625, + "rewards/margins": 5.26953125, + "rewards/rejected": -3.419921875, + "step": 803 + }, + { + "epoch": 0.1595317228037105, + "grad_norm": 39.15043365950197, + "learning_rate": 9.903855163110123e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.81640625, + "logps/chosen": -2030.0, + "logps/rejected": -615.0, + "loss": 0.5145, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.05078125, + "rewards/margins": 2.0146484375, + "rewards/rejected": -3.0703125, + "step": 804 + }, + { + "epoch": 0.15973014534451113, + "grad_norm": 47.0026863718609, + "learning_rate": 9.903213297565396e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 4.421875, + "logps/chosen": -1197.0, + "logps/rejected": -905.0, + "loss": 0.4959, 
+ "rewards/accuracies": 0.8125, + "rewards/chosen": 1.625, + "rewards/margins": 5.67578125, + "rewards/rejected": -4.05078125, + "step": 805 + }, + { + "epoch": 0.15992856788531176, + "grad_norm": 47.70642936715322, + "learning_rate": 9.902569319877463e-07, + "logits/chosen": 3.796875, + "logits/rejected": 4.17578125, + "logps/chosen": -1037.0, + "logps/rejected": -898.0, + "loss": 0.4134, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.423828125, + "rewards/margins": 5.6796875, + "rewards/rejected": -4.25, + "step": 806 + }, + { + "epoch": 0.16012699042611242, + "grad_norm": 49.422292921679166, + "learning_rate": 9.901923230355234e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.2734375, + "logps/chosen": -1155.0, + "logps/rejected": -789.0, + "loss": 0.4167, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.556640625, + "rewards/margins": 5.875, + "rewards/rejected": -4.3125, + "step": 807 + }, + { + "epoch": 0.16032541296691305, + "grad_norm": 42.40611182682183, + "learning_rate": 9.901275029308622e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.55078125, + "logps/chosen": -871.0, + "logps/rejected": -839.5, + "loss": 0.4657, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2958984375, + "rewards/margins": 5.82421875, + "rewards/rejected": -4.546875, + "step": 808 + }, + { + "epoch": 0.16052383550771368, + "grad_norm": 46.240085334443826, + "learning_rate": 9.900624717048561e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 3.87890625, + "logps/chosen": -895.0, + "logps/rejected": -792.0, + "loss": 0.5776, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.72900390625, + "rewards/margins": 4.5859375, + "rewards/rejected": -3.85546875, + "step": 809 + }, + { + "epoch": 0.1607222580485143, + "grad_norm": 38.83634187910216, + "learning_rate": 9.89997229388699e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.9140625, + "logps/chosen": -806.5, + "logps/rejected": -835.0, + "loss": 0.6547, + "rewards/accuracies": 0.75, 
+ "rewards/chosen": 0.728515625, + "rewards/margins": 4.17578125, + "rewards/rejected": -3.44140625, + "step": 810 + }, + { + "epoch": 0.16092068058931494, + "grad_norm": 38.12263678915796, + "learning_rate": 9.899317760136871e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 3.8671875, + "logps/chosen": -1405.0, + "logps/rejected": -534.0, + "loss": 0.5868, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.08935546875, + "rewards/margins": 2.73046875, + "rewards/rejected": -2.814453125, + "step": 811 + }, + { + "epoch": 0.16111910313011557, + "grad_norm": 37.20336875614293, + "learning_rate": 9.898661116112167e-07, + "logits/chosen": 3.875, + "logits/rejected": 4.14453125, + "logps/chosen": -1255.0, + "logps/rejected": -1302.5, + "loss": 0.579, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.63671875, + "rewards/margins": 6.26953125, + "rewards/rejected": -4.640625, + "step": 812 + }, + { + "epoch": 0.16131752567091623, + "grad_norm": 42.34934857509767, + "learning_rate": 9.89800236212786e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.12890625, + "logps/chosen": -958.0, + "logps/rejected": -712.0, + "loss": 0.5997, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.0927734375, + "rewards/margins": 4.01953125, + "rewards/rejected": -2.931640625, + "step": 813 + }, + { + "epoch": 0.16151594821171686, + "grad_norm": 37.97143158032641, + "learning_rate": 9.897341498499943e-07, + "logits/chosen": 3.953125, + "logits/rejected": 3.703125, + "logps/chosen": -947.0, + "logps/rejected": -578.0, + "loss": 0.606, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.00634765625, + "rewards/margins": 3.1953125, + "rewards/rejected": -2.18359375, + "step": 814 + }, + { + "epoch": 0.1617143707525175, + "grad_norm": 44.71764108864572, + "learning_rate": 9.89667852554542e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.83984375, + "logps/chosen": -1349.0, + "logps/rejected": -947.0, + "loss": 0.5018, + "rewards/accuracies": 0.84375, + 
"rewards/chosen": 1.84765625, + "rewards/margins": 5.15234375, + "rewards/rejected": -3.314453125, + "step": 815 + }, + { + "epoch": 0.16191279329331812, + "grad_norm": 33.392702787463875, + "learning_rate": 9.896013443582308e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 3.67578125, + "logps/chosen": -1012.0, + "logps/rejected": -655.0, + "loss": 0.4518, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.533203125, + "rewards/margins": 4.69140625, + "rewards/rejected": -3.162109375, + "step": 816 + }, + { + "epoch": 0.16211121583411875, + "grad_norm": 35.644815948692575, + "learning_rate": 9.895346252929634e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.1875, + "logps/chosen": -1167.5, + "logps/rejected": -948.0, + "loss": 0.522, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.52734375, + "rewards/margins": 5.4140625, + "rewards/rejected": -3.87890625, + "step": 817 + }, + { + "epoch": 0.16230963837491938, + "grad_norm": 43.94416492252533, + "learning_rate": 9.89467695390744e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.125, + "logps/chosen": -1395.0, + "logps/rejected": -902.0, + "loss": 0.3783, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.2958984375, + "rewards/margins": 7.21875, + "rewards/rejected": -4.9140625, + "step": 818 + }, + { + "epoch": 0.16250806091572004, + "grad_norm": 39.42673436719143, + "learning_rate": 9.894005546836775e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 4.26953125, + "logps/chosen": -930.0, + "logps/rejected": -797.0, + "loss": 0.483, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.2275390625, + "rewards/margins": 5.23046875, + "rewards/rejected": -3.99609375, + "step": 819 + }, + { + "epoch": 0.16270648345652067, + "grad_norm": 46.767925709034834, + "learning_rate": 9.8933320320397e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.5078125, + "logps/chosen": -961.0, + "logps/rejected": -1003.0, + "loss": 0.4955, + "rewards/accuracies": 0.90625, + 
"rewards/chosen": 1.44921875, + "rewards/margins": 4.82421875, + "rewards/rejected": -3.375, + "step": 820 + }, + { + "epoch": 0.1629049059973213, + "grad_norm": 42.62856903919613, + "learning_rate": 9.89265640983929e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.4765625, + "logps/chosen": -1161.0, + "logps/rejected": -1000.0, + "loss": 0.5755, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.435546875, + "rewards/margins": 5.86328125, + "rewards/rejected": -4.427734375, + "step": 821 + }, + { + "epoch": 0.16310332853812193, + "grad_norm": 38.561166526237315, + "learning_rate": 9.891978680559627e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 4.2734375, + "logps/chosen": -840.0, + "logps/rejected": -497.0, + "loss": 0.4349, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.3232421875, + "rewards/margins": 6.4921875, + "rewards/rejected": -5.17578125, + "step": 822 + }, + { + "epoch": 0.16330175107892256, + "grad_norm": 36.05308462571683, + "learning_rate": 9.891298844525807e-07, + "logits/chosen": 3.5234375, + "logits/rejected": 3.66015625, + "logps/chosen": -770.0, + "logps/rejected": -1375.0, + "loss": 0.5231, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.712890625, + "rewards/margins": 6.16015625, + "rewards/rejected": -5.44140625, + "step": 823 + }, + { + "epoch": 0.1635001736197232, + "grad_norm": 42.54747719296175, + "learning_rate": 9.890616902063935e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.00390625, + "logps/chosen": -993.0, + "logps/rejected": -858.0, + "loss": 0.4708, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.423828125, + "rewards/margins": 5.158203125, + "rewards/rejected": -3.72265625, + "step": 824 + }, + { + "epoch": 0.16369859616052385, + "grad_norm": 45.472033532066874, + "learning_rate": 9.889932853501129e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 4.05078125, + "logps/chosen": -1084.0, + "logps/rejected": -1503.5, + "loss": 0.5387, + "rewards/accuracies": 0.78125, + 
"rewards/chosen": 1.673828125, + "rewards/margins": 6.05078125, + "rewards/rejected": -4.376953125, + "step": 825 + }, + { + "epoch": 0.16389701870132448, + "grad_norm": 44.9806332290295, + "learning_rate": 9.88924669916551e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.35546875, + "logps/chosen": -881.0, + "logps/rejected": -635.5, + "loss": 0.5034, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.0224609375, + "rewards/margins": 4.578125, + "rewards/rejected": -3.55078125, + "step": 826 + }, + { + "epoch": 0.1640954412421251, + "grad_norm": 34.56582432534759, + "learning_rate": 9.88855843938622e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 3.91015625, + "logps/chosen": -1020.0, + "logps/rejected": -635.5, + "loss": 0.552, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.197265625, + "rewards/margins": 4.296875, + "rewards/rejected": -3.0859375, + "step": 827 + }, + { + "epoch": 0.16429386378292574, + "grad_norm": 48.83305799589915, + "learning_rate": 9.887868074493398e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.15625, + "logps/chosen": -1046.0, + "logps/rejected": -825.0, + "loss": 0.4221, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.490966796875, + "rewards/margins": 5.802734375, + "rewards/rejected": -4.314453125, + "step": 828 + }, + { + "epoch": 0.16449228632372637, + "grad_norm": 39.223649120421996, + "learning_rate": 9.887175604818206e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 4.15234375, + "logps/chosen": -1339.0, + "logps/rejected": -876.0, + "loss": 0.396, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.24609375, + "rewards/margins": 6.84375, + "rewards/rejected": -4.59375, + "step": 829 + }, + { + "epoch": 0.164690708864527, + "grad_norm": 43.420392398230604, + "learning_rate": 9.886481030692806e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.29296875, + "logps/chosen": -1329.0, + "logps/rejected": -1108.0, + "loss": 0.3827, + "rewards/accuracies": 0.90625, + 
"rewards/chosen": 1.52734375, + "rewards/margins": 6.00390625, + "rewards/rejected": -4.474609375, + "step": 830 + }, + { + "epoch": 0.16488913140532765, + "grad_norm": 32.79134870151881, + "learning_rate": 9.885784352450375e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.5625, + "logps/chosen": -895.0, + "logps/rejected": -669.0, + "loss": 0.4493, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.32421875, + "rewards/margins": 5.765625, + "rewards/rejected": -3.4453125, + "step": 831 + }, + { + "epoch": 0.16508755394612828, + "grad_norm": 44.777441011793115, + "learning_rate": 9.885085570425096e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 3.97265625, + "logps/chosen": -1039.0, + "logps/rejected": -546.0, + "loss": 0.5093, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.416015625, + "rewards/margins": 4.125, + "rewards/rejected": -2.70703125, + "step": 832 + }, + { + "epoch": 0.16528597648692891, + "grad_norm": 45.12279985119465, + "learning_rate": 9.88438468495216e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.171875, + "logps/chosen": -1202.0, + "logps/rejected": -1221.0, + "loss": 0.4436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.3837890625, + "rewards/margins": 6.7109375, + "rewards/rejected": -5.322265625, + "step": 833 + }, + { + "epoch": 0.16548439902772955, + "grad_norm": 34.71723440055554, + "learning_rate": 9.883681696367773e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 4.171875, + "logps/chosen": -727.0, + "logps/rejected": -467.75, + "loss": 0.5634, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.951171875, + "rewards/margins": 3.640625, + "rewards/rejected": -2.6884765625, + "step": 834 + }, + { + "epoch": 0.16568282156853018, + "grad_norm": 35.00894639325315, + "learning_rate": 9.882976605009146e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 3.97265625, + "logps/chosen": -852.0, + "logps/rejected": -615.5, + "loss": 0.4538, + "rewards/accuracies": 0.875, + 
"rewards/chosen": 1.4462890625, + "rewards/margins": 4.47265625, + "rewards/rejected": -3.02734375, + "step": 835 + }, + { + "epoch": 0.16588124410933083, + "grad_norm": 43.584682117129375, + "learning_rate": 9.882269411214496e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 3.9609375, + "logps/chosen": -984.0, + "logps/rejected": -525.0, + "loss": 0.5238, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2459716796875, + "rewards/margins": 3.494140625, + "rewards/rejected": -2.251953125, + "step": 836 + }, + { + "epoch": 0.16607966665013146, + "grad_norm": 34.268118663155995, + "learning_rate": 9.881560115323055e-07, + "logits/chosen": 4.046875, + "logits/rejected": 4.15234375, + "logps/chosen": -976.0, + "logps/rejected": -642.5, + "loss": 0.5376, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.732421875, + "rewards/margins": 4.33203125, + "rewards/rejected": -2.595703125, + "step": 837 + }, + { + "epoch": 0.1662780891909321, + "grad_norm": 38.111018921168764, + "learning_rate": 9.880848717675054e-07, + "logits/chosen": 4.32421875, + "logits/rejected": 4.6875, + "logps/chosen": -1068.0, + "logps/rejected": -1415.0, + "loss": 0.5139, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.93115234375, + "rewards/margins": 5.97265625, + "rewards/rejected": -4.0546875, + "step": 838 + }, + { + "epoch": 0.16647651173173272, + "grad_norm": 34.444809632396854, + "learning_rate": 9.880135218611745e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 4.0234375, + "logps/chosen": -1040.0, + "logps/rejected": -566.75, + "loss": 0.5803, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.5576171875, + "rewards/margins": 4.17578125, + "rewards/rejected": -2.623046875, + "step": 839 + }, + { + "epoch": 0.16667493427253335, + "grad_norm": 33.19869182125766, + "learning_rate": 9.879419618475375e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.05859375, + "logps/chosen": -1022.0, + "logps/rejected": -745.0, + "loss": 0.3841, + "rewards/accuracies": 
0.9375, + "rewards/chosen": 2.021484375, + "rewards/margins": 5.84765625, + "rewards/rejected": -3.833984375, + "step": 840 + }, + { + "epoch": 0.16687335681333398, + "grad_norm": 36.45255437741379, + "learning_rate": 9.878701917609207e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.31640625, + "logps/chosen": -909.0, + "logps/rejected": -816.0, + "loss": 0.4818, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.544921875, + "rewards/margins": 5.46875, + "rewards/rejected": -3.921875, + "step": 841 + }, + { + "epoch": 0.16707177935413464, + "grad_norm": 43.16626006103148, + "learning_rate": 9.877982116357508e-07, + "logits/chosen": 3.48828125, + "logits/rejected": 3.76953125, + "logps/chosen": -739.0, + "logps/rejected": -697.0, + "loss": 0.5926, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8896484375, + "rewards/margins": 4.32421875, + "rewards/rejected": -3.431640625, + "step": 842 + }, + { + "epoch": 0.16727020189493527, + "grad_norm": 37.50660301634821, + "learning_rate": 9.87726021506556e-07, + "logits/chosen": 3.3359375, + "logits/rejected": 3.69140625, + "logps/chosen": -762.5, + "logps/rejected": -515.5, + "loss": 0.6225, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.138916015625, + "rewards/margins": 3.9296875, + "rewards/rejected": -2.78515625, + "step": 843 + }, + { + "epoch": 0.1674686244357359, + "grad_norm": 44.75900132685911, + "learning_rate": 9.876536214079638e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.8125, + "logps/chosen": -1081.0, + "logps/rejected": -624.0, + "loss": 0.4175, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.408203125, + "rewards/margins": 5.4375, + "rewards/rejected": -4.03125, + "step": 844 + }, + { + "epoch": 0.16766704697653653, + "grad_norm": 28.214828419842778, + "learning_rate": 9.875810113747034e-07, + "logits/chosen": 4.8359375, + "logits/rejected": 4.796875, + "logps/chosen": -1245.0, + "logps/rejected": -806.0, + "loss": 0.3942, + "rewards/accuracies": 0.84375, + 
"rewards/chosen": 2.442474365234375, + "rewards/margins": 6.390625, + "rewards/rejected": -3.94140625, + "step": 845 + }, + { + "epoch": 0.16786546951733716, + "grad_norm": 35.06349456188174, + "learning_rate": 9.87508191441605e-07, + "logits/chosen": 4.046875, + "logits/rejected": 4.3046875, + "logps/chosen": -949.0, + "logps/rejected": -652.0, + "loss": 0.4538, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.6015625, + "rewards/margins": 5.150390625, + "rewards/rejected": -3.55859375, + "step": 846 + }, + { + "epoch": 0.1680638920581378, + "grad_norm": 47.38201093633971, + "learning_rate": 9.874351616435985e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.32421875, + "logps/chosen": -1100.0, + "logps/rejected": -832.0, + "loss": 0.4493, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.90185546875, + "rewards/margins": 4.85546875, + "rewards/rejected": -3.94921875, + "step": 847 + }, + { + "epoch": 0.16826231459893845, + "grad_norm": 33.63894443124559, + "learning_rate": 9.873619220157154e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 4.171875, + "logps/chosen": -793.0, + "logps/rejected": -811.0, + "loss": 0.4039, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.634765625, + "rewards/margins": 5.5703125, + "rewards/rejected": -3.93359375, + "step": 848 + }, + { + "epoch": 0.16846073713973908, + "grad_norm": 37.16050060734852, + "learning_rate": 9.872884725930871e-07, + "logits/chosen": 3.96875, + "logits/rejected": 3.8046875, + "logps/chosen": -1165.0, + "logps/rejected": -1266.75, + "loss": 0.4789, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8828125, + "rewards/margins": 5.197265625, + "rewards/rejected": -4.30078125, + "step": 849 + }, + { + "epoch": 0.1686591596805397, + "grad_norm": 41.95737134625518, + "learning_rate": 9.872148134109464e-07, + "logits/chosen": 3.671875, + "logits/rejected": 3.82421875, + "logps/chosen": -1106.0, + "logps/rejected": -761.0, + "loss": 0.4118, + "rewards/accuracies": 0.9375, + 
"rewards/chosen": 1.67578125, + "rewards/margins": 5.4375, + "rewards/rejected": -3.7734375, + "step": 850 + }, + { + "epoch": 0.16885758222134034, + "grad_norm": 40.40253420384121, + "learning_rate": 9.871409445046261e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.1953125, + "logps/chosen": -900.0, + "logps/rejected": -922.5, + "loss": 0.4697, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.759765625, + "rewards/margins": 5.1875, + "rewards/rejected": -3.43359375, + "step": 851 + }, + { + "epoch": 0.16905600476214097, + "grad_norm": 30.822128367690578, + "learning_rate": 9.870668659095597e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.09765625, + "logps/chosen": -1440.0, + "logps/rejected": -735.0, + "loss": 0.3094, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4921875, + "rewards/margins": 6.4921875, + "rewards/rejected": -4.0, + "step": 852 + }, + { + "epoch": 0.1692544273029416, + "grad_norm": 36.59167920419148, + "learning_rate": 9.869925776612815e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.2734375, + "logps/chosen": -1071.0, + "logps/rejected": -592.5, + "loss": 0.4296, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.796875, + "rewards/margins": 4.6640625, + "rewards/rejected": -2.86328125, + "step": 853 + }, + { + "epoch": 0.16945284984374226, + "grad_norm": 39.29935759231889, + "learning_rate": 9.86918079795426e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.2890625, + "logps/chosen": -941.0, + "logps/rejected": -697.5, + "loss": 0.492, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.7587890625, + "rewards/margins": 4.19140625, + "rewards/rejected": -2.431640625, + "step": 854 + }, + { + "epoch": 0.1696512723845429, + "grad_norm": 41.730031033980865, + "learning_rate": 9.86843372347729e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 3.91796875, + "logps/chosen": -1014.0, + "logps/rejected": -1464.0, + "loss": 0.5278, + "rewards/accuracies": 0.78125, + "rewards/chosen": 
1.55859375, + "rewards/margins": 5.1796875, + "rewards/rejected": -3.62109375, + "step": 855 + }, + { + "epoch": 0.16984969492534352, + "grad_norm": 43.647372433663634, + "learning_rate": 9.86768455354026e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 3.859375, + "logps/chosen": -756.0, + "logps/rejected": -644.0, + "loss": 0.5076, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4326171875, + "rewards/margins": 4.6328125, + "rewards/rejected": -3.205078125, + "step": 856 + }, + { + "epoch": 0.17004811746614415, + "grad_norm": 33.55495034972409, + "learning_rate": 9.866933288502533e-07, + "logits/chosen": 3.74609375, + "logits/rejected": 3.859375, + "logps/chosen": -983.0, + "logps/rejected": -682.5, + "loss": 0.4053, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.73046875, + "rewards/margins": 5.0078125, + "rewards/rejected": -3.26953125, + "step": 857 + }, + { + "epoch": 0.17024654000694478, + "grad_norm": 34.75011284397446, + "learning_rate": 9.866179928724478e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 3.9140625, + "logps/chosen": -1047.0, + "logps/rejected": -661.0, + "loss": 0.4536, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.654296875, + "rewards/margins": 4.703125, + "rewards/rejected": -3.05859375, + "step": 858 + }, + { + "epoch": 0.17044496254774544, + "grad_norm": 47.256466379257326, + "learning_rate": 9.86542447456747e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 3.90234375, + "logps/chosen": -1005.5, + "logps/rejected": -660.0, + "loss": 0.4381, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.20703125, + "rewards/margins": 4.6796875, + "rewards/rejected": -2.47265625, + "step": 859 + }, + { + "epoch": 0.17064338508854607, + "grad_norm": 35.00160160743699, + "learning_rate": 9.864666926393884e-07, + "logits/chosen": 4.22265625, + "logits/rejected": 4.1640625, + "logps/chosen": -802.0, + "logps/rejected": -694.0, + "loss": 0.5169, + "rewards/accuracies": 0.875, + "rewards/chosen": 
2.0302734375, + "rewards/margins": 4.6015625, + "rewards/rejected": -2.576171875, + "step": 860 + }, + { + "epoch": 0.1708418076293467, + "grad_norm": 45.112386586071054, + "learning_rate": 9.863907284567107e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.11328125, + "logps/chosen": -891.0, + "logps/rejected": -657.0, + "loss": 0.4574, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4931640625, + "rewards/margins": 4.06640625, + "rewards/rejected": -2.572265625, + "step": 861 + }, + { + "epoch": 0.17104023017014733, + "grad_norm": 39.01284589202795, + "learning_rate": 9.863145549451522e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.7734375, + "logps/chosen": -933.0, + "logps/rejected": -769.0, + "loss": 0.493, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6875, + "rewards/margins": 4.47265625, + "rewards/rejected": -2.791015625, + "step": 862 + }, + { + "epoch": 0.17123865271094796, + "grad_norm": 46.20689927797559, + "learning_rate": 9.86238172141252e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 3.93359375, + "logps/chosen": -1462.0, + "logps/rejected": -607.5, + "loss": 0.5039, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.953125, + "rewards/margins": 3.640625, + "rewards/rejected": -2.685546875, + "step": 863 + }, + { + "epoch": 0.1714370752517486, + "grad_norm": 34.85174208600791, + "learning_rate": 9.861615800816497e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.7421875, + "logps/chosen": -1075.0, + "logps/rejected": -629.0, + "loss": 0.3713, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.73828125, + "rewards/margins": 5.625, + "rewards/rejected": -3.8828125, + "step": 864 + }, + { + "epoch": 0.17163549779254925, + "grad_norm": 35.28334312706669, + "learning_rate": 9.86084778803085e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.0859375, + "logps/chosen": -1087.5, + "logps/rejected": -1181.5, + "loss": 0.4663, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.833984375, + 
"rewards/margins": 6.10546875, + "rewards/rejected": -4.275390625, + "step": 865 + }, + { + "epoch": 0.17183392033334988, + "grad_norm": 41.335247016814, + "learning_rate": 9.860077683423982e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.3359375, + "logps/chosen": -930.5, + "logps/rejected": -544.25, + "loss": 0.5319, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.73828125, + "rewards/margins": 3.5625, + "rewards/rejected": -1.8310546875, + "step": 866 + }, + { + "epoch": 0.1720323428741505, + "grad_norm": 31.611868395714293, + "learning_rate": 9.8593054873653e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.3984375, + "logps/chosen": -767.0, + "logps/rejected": -533.0, + "loss": 0.617, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.5166015625, + "rewards/margins": 3.6806640625, + "rewards/rejected": -2.1630859375, + "step": 867 + }, + { + "epoch": 0.17223076541495114, + "grad_norm": 40.04248807479401, + "learning_rate": 9.85853120022521e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.46875, + "logps/chosen": -1174.0, + "logps/rejected": -752.0, + "loss": 0.4434, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.041015625, + "rewards/margins": 5.796875, + "rewards/rejected": -3.76171875, + "step": 868 + }, + { + "epoch": 0.17242918795575177, + "grad_norm": 42.49779387030601, + "learning_rate": 9.857754822375126e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 3.90625, + "logps/chosen": -828.5, + "logps/rejected": -611.5, + "loss": 0.5351, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.21875, + "rewards/margins": 4.0234375, + "rewards/rejected": -2.80859375, + "step": 869 + }, + { + "epoch": 0.1726276104965524, + "grad_norm": 40.17884095634145, + "learning_rate": 9.856976354187461e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 3.93359375, + "logps/chosen": -1527.0, + "logps/rejected": -790.5, + "loss": 0.2824, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.8251953125, + 
"rewards/margins": 7.4609375, + "rewards/rejected": -5.6328125, + "step": 870 + }, + { + "epoch": 0.17282603303735306, + "grad_norm": 52.703701522733745, + "learning_rate": 9.856195796035634e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.15234375, + "logps/chosen": -921.0, + "logps/rejected": -668.0, + "loss": 0.4737, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.49609375, + "rewards/margins": 5.4453125, + "rewards/rejected": -3.955078125, + "step": 871 + }, + { + "epoch": 0.17302445557815369, + "grad_norm": 33.91928509047304, + "learning_rate": 9.855413148294063e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.31640625, + "logps/chosen": -887.0, + "logps/rejected": -525.5, + "loss": 0.4751, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.38055419921875, + "rewards/margins": 4.6796875, + "rewards/rejected": -3.310546875, + "step": 872 + }, + { + "epoch": 0.17322287811895432, + "grad_norm": 44.480232835822235, + "learning_rate": 9.854628411338171e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 4.109375, + "logps/chosen": -882.5, + "logps/rejected": -648.0, + "loss": 0.5315, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.716796875, + "rewards/margins": 3.8984375, + "rewards/rejected": -2.177734375, + "step": 873 + }, + { + "epoch": 0.17342130065975495, + "grad_norm": 38.805057606086066, + "learning_rate": 9.853841585544384e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.07421875, + "logps/chosen": -1272.0, + "logps/rejected": -897.0, + "loss": 0.3091, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1015625, + "rewards/margins": 6.8984375, + "rewards/rejected": -4.796875, + "step": 874 + }, + { + "epoch": 0.17361972320055558, + "grad_norm": 39.895677719070974, + "learning_rate": 9.853052671290125e-07, + "logits/chosen": 3.56640625, + "logits/rejected": 3.46484375, + "logps/chosen": -1139.0, + "logps/rejected": -670.0, + "loss": 0.4032, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.7177734375, + 
"rewards/margins": 5.23046875, + "rewards/rejected": -3.51171875, + "step": 875 + }, + { + "epoch": 0.1738181457413562, + "grad_norm": 36.5197589491104, + "learning_rate": 9.852261668953824e-07, + "logits/chosen": 4.046875, + "logits/rejected": 4.125, + "logps/chosen": -930.0, + "logps/rejected": -974.0, + "loss": 0.4663, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.330810546875, + "rewards/margins": 13.28515625, + "rewards/rejected": -11.98046875, + "step": 876 + }, + { + "epoch": 0.17401656828215686, + "grad_norm": 32.35145932388004, + "learning_rate": 9.851468578914913e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.08984375, + "logps/chosen": -1271.0, + "logps/rejected": -759.5, + "loss": 0.3233, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.046875, + "rewards/margins": 7.0, + "rewards/rejected": -4.94921875, + "step": 877 + }, + { + "epoch": 0.1742149908229575, + "grad_norm": 39.66327455754156, + "learning_rate": 9.85067340155382e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.7890625, + "logps/chosen": -1111.0, + "logps/rejected": -840.0, + "loss": 0.4113, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.794921875, + "rewards/margins": 5.6796875, + "rewards/rejected": -3.890625, + "step": 878 + }, + { + "epoch": 0.17441341336375812, + "grad_norm": 45.325857309319716, + "learning_rate": 9.849876137251976e-07, + "logits/chosen": 3.35546875, + "logits/rejected": 3.58984375, + "logps/chosen": -1061.0, + "logps/rejected": -1249.0, + "loss": 0.4352, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.11474609375, + "rewards/margins": 6.4453125, + "rewards/rejected": -5.3359375, + "step": 879 + }, + { + "epoch": 0.17461183590455875, + "grad_norm": 34.29041684190326, + "learning_rate": 9.84907678639182e-07, + "logits/chosen": 3.52734375, + "logits/rejected": 3.625, + "logps/chosen": -1037.0, + "logps/rejected": -851.0, + "loss": 0.5462, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.1787109375, + "rewards/margins": 
9.7734375, + "rewards/rejected": -8.5625, + "step": 880 + }, + { + "epoch": 0.17481025844535938, + "grad_norm": 35.15433024636625, + "learning_rate": 9.848275349356783e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.92578125, + "logps/chosen": -918.0, + "logps/rejected": -603.5, + "loss": 0.5508, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.611328125, + "rewards/margins": 3.8359375, + "rewards/rejected": -2.220703125, + "step": 881 + }, + { + "epoch": 0.17500868098616001, + "grad_norm": 38.69086798260465, + "learning_rate": 9.847471826531299e-07, + "logits/chosen": 3.609375, + "logits/rejected": 3.66796875, + "logps/chosen": -1148.0, + "logps/rejected": -648.0, + "loss": 0.4607, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.0771484375, + "rewards/margins": 4.08984375, + "rewards/rejected": -3.01171875, + "step": 882 + }, + { + "epoch": 0.17520710352696067, + "grad_norm": 50.2196742132419, + "learning_rate": 9.846666218300807e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 3.953125, + "logps/chosen": -963.0, + "logps/rejected": -703.0, + "loss": 0.4751, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.83984375, + "rewards/margins": 4.5390625, + "rewards/rejected": -2.703125, + "step": 883 + }, + { + "epoch": 0.1754055260677613, + "grad_norm": 41.634415179383694, + "learning_rate": 9.84585852505174e-07, + "logits/chosen": 3.76953125, + "logits/rejected": 4.11328125, + "logps/chosen": -832.0, + "logps/rejected": -1387.0, + "loss": 0.5527, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.364990234375, + "rewards/margins": 5.9296875, + "rewards/rejected": -4.57421875, + "step": 884 + }, + { + "epoch": 0.17560394860856193, + "grad_norm": 38.47437140791208, + "learning_rate": 9.845048747171535e-07, + "logits/chosen": 3.74609375, + "logits/rejected": 3.70703125, + "logps/chosen": -1217.0, + "logps/rejected": -744.0, + "loss": 0.3397, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.0078125, + "rewards/margins": 6.1875, 
+ "rewards/rejected": -4.1796875, + "step": 885 + }, + { + "epoch": 0.17580237114936256, + "grad_norm": 40.56892140343868, + "learning_rate": 9.84423688504863e-07, + "logits/chosen": 3.5546875, + "logits/rejected": 3.359375, + "logps/chosen": -1713.0, + "logps/rejected": -517.5, + "loss": 0.4946, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.591796875, + "rewards/margins": 2.28125, + "rewards/rejected": -3.8828125, + "step": 886 + }, + { + "epoch": 0.1760007936901632, + "grad_norm": 39.62211488536374, + "learning_rate": 9.843422939072458e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.97265625, + "logps/chosen": -926.0, + "logps/rejected": -864.0, + "loss": 0.5994, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.7099609375, + "rewards/margins": 4.28515625, + "rewards/rejected": -3.578125, + "step": 887 + }, + { + "epoch": 0.17619921623096385, + "grad_norm": 52.31850360018291, + "learning_rate": 9.842606909633456e-07, + "logits/chosen": 3.7109375, + "logits/rejected": 3.75390625, + "logps/chosen": -1072.0, + "logps/rejected": -1700.0, + "loss": 0.602, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.468505859375, + "rewards/margins": 6.71875, + "rewards/rejected": -6.24609375, + "step": 888 + }, + { + "epoch": 0.17639763877176448, + "grad_norm": 40.66824112164225, + "learning_rate": 9.841788797123056e-07, + "logits/chosen": 3.6171875, + "logits/rejected": 3.50390625, + "logps/chosen": -1124.5, + "logps/rejected": -719.5, + "loss": 0.5129, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.326171875, + "rewards/margins": 4.39453125, + "rewards/rejected": -2.060546875, + "step": 889 + }, + { + "epoch": 0.1765960613125651, + "grad_norm": 39.48996032828341, + "learning_rate": 9.840968601933697e-07, + "logits/chosen": 3.62109375, + "logits/rejected": 3.78515625, + "logps/chosen": -763.0, + "logps/rejected": -612.0, + "loss": 0.4981, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.267578125, + "rewards/margins": 4.55078125, + 
"rewards/rejected": -3.28515625, + "step": 890 + }, + { + "epoch": 0.17679448385336574, + "grad_norm": 41.28823575639752, + "learning_rate": 9.840146324458808e-07, + "logits/chosen": 3.66796875, + "logits/rejected": 3.7109375, + "logps/chosen": -1039.0, + "logps/rejected": -907.5, + "loss": 0.4314, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8447265625, + "rewards/margins": 6.0859375, + "rewards/rejected": -4.23828125, + "step": 891 + }, + { + "epoch": 0.17699290639416637, + "grad_norm": 39.192817088533886, + "learning_rate": 9.839321965092825e-07, + "logits/chosen": 3.73046875, + "logits/rejected": 4.08203125, + "logps/chosen": -1028.0, + "logps/rejected": -1055.0, + "loss": 0.5131, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9482421875, + "rewards/margins": 7.5078125, + "rewards/rejected": -6.5546875, + "step": 892 + }, + { + "epoch": 0.177191328934967, + "grad_norm": 40.313211037412295, + "learning_rate": 9.838495524231173e-07, + "logits/chosen": 3.796875, + "logits/rejected": 3.9765625, + "logps/chosen": -929.0, + "logps/rejected": -1026.0, + "loss": 0.5076, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.164794921875, + "rewards/margins": 5.3515625, + "rewards/rejected": -4.1796875, + "step": 893 + }, + { + "epoch": 0.17738975147576766, + "grad_norm": 35.883948878912705, + "learning_rate": 9.837667002270285e-07, + "logits/chosen": 3.77734375, + "logits/rejected": 3.9609375, + "logps/chosen": -975.0, + "logps/rejected": -1054.5, + "loss": 0.4025, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.2587890625, + "rewards/margins": 6.4140625, + "rewards/rejected": -5.15234375, + "step": 894 + }, + { + "epoch": 0.1775881740165683, + "grad_norm": 33.80933536395151, + "learning_rate": 9.836836399607588e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.27734375, + "logps/chosen": -565.5, + "logps/rejected": -1040.5, + "loss": 0.4748, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.59765625, + "rewards/margins": 6.4921875, + 
"rewards/rejected": -4.89453125, + "step": 895 + }, + { + "epoch": 0.17778659655736892, + "grad_norm": 42.295655812957364, + "learning_rate": 9.836003716641504e-07, + "logits/chosen": 3.734375, + "logits/rejected": 3.9296875, + "logps/chosen": -998.0, + "logps/rejected": -1118.5, + "loss": 0.4219, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.873046875, + "rewards/margins": 6.328125, + "rewards/rejected": -4.45703125, + "step": 896 + }, + { + "epoch": 0.17798501909816955, + "grad_norm": 43.55519691245079, + "learning_rate": 9.83516895377146e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 4.09765625, + "logps/chosen": -830.5, + "logps/rejected": -558.5, + "loss": 0.6179, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.208984375, + "rewards/margins": 4.01953125, + "rewards/rejected": -2.8125, + "step": 897 + }, + { + "epoch": 0.17818344163897018, + "grad_norm": 43.33952758689372, + "learning_rate": 9.834332111397878e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 4.1875, + "logps/chosen": -945.0, + "logps/rejected": -714.0, + "loss": 0.4587, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6728515625, + "rewards/margins": 5.98828125, + "rewards/rejected": -4.3046875, + "step": 898 + }, + { + "epoch": 0.1783818641797708, + "grad_norm": 31.791545825654875, + "learning_rate": 9.83349318992217e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.11328125, + "logps/chosen": -873.0, + "logps/rejected": -597.0, + "loss": 0.4241, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.08984375, + "rewards/margins": 4.59375, + "rewards/rejected": -3.5078125, + "step": 899 + }, + { + "epoch": 0.17858028672057147, + "grad_norm": 44.22925689508789, + "learning_rate": 9.83265218974676e-07, + "logits/chosen": 3.46875, + "logits/rejected": 4.02734375, + "logps/chosen": -956.0, + "logps/rejected": -1087.0, + "loss": 0.5792, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.71728515625, + "rewards/margins": 6.0, + "rewards/rejected": 
-5.28125, + "step": 900 + }, + { + "epoch": 0.1787787092613721, + "grad_norm": 45.672044265338364, + "learning_rate": 9.831809111275054e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.1796875, + "logps/chosen": -1347.0, + "logps/rejected": -1277.0, + "loss": 0.4649, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.30078125, + "rewards/margins": 5.328125, + "rewards/rejected": -3.03515625, + "step": 901 + }, + { + "epoch": 0.17897713180217273, + "grad_norm": 35.44011596366531, + "learning_rate": 9.830963954911465e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.609375, + "logps/chosen": -878.0, + "logps/rejected": -640.5, + "loss": 0.4475, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.3359375, + "rewards/margins": 5.125, + "rewards/rejected": -3.78125, + "step": 902 + }, + { + "epoch": 0.17917555434297336, + "grad_norm": 44.96184022241947, + "learning_rate": 9.8301167210614e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.02734375, + "logps/chosen": -1620.0, + "logps/rejected": -1133.0, + "loss": 0.3716, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.380859375, + "rewards/margins": 5.5859375, + "rewards/rejected": -3.20703125, + "step": 903 + }, + { + "epoch": 0.179373976883774, + "grad_norm": 31.908668709327145, + "learning_rate": 9.82926741013126e-07, + "logits/chosen": 3.69140625, + "logits/rejected": 3.72265625, + "logps/chosen": -1108.0, + "logps/rejected": -761.0, + "loss": 0.3665, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.91796875, + "rewards/margins": 6.875, + "rewards/rejected": -3.966796875, + "step": 904 + }, + { + "epoch": 0.17957239942457462, + "grad_norm": 48.142892179285674, + "learning_rate": 9.828416022528445e-07, + "logits/chosen": 3.921875, + "logits/rejected": 3.953125, + "logps/chosen": -1143.0, + "logps/rejected": -886.5, + "loss": 0.4271, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.8037109375, + "rewards/margins": 5.8359375, + "rewards/rejected": -4.03515625, + "step": 
905 + }, + { + "epoch": 0.17977082196537528, + "grad_norm": 36.40923549982749, + "learning_rate": 9.82756255866135e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.25390625, + "logps/chosen": -895.5, + "logps/rejected": -1822.0, + "loss": 0.5062, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.587890625, + "rewards/margins": 8.375, + "rewards/rejected": -6.78125, + "step": 906 + }, + { + "epoch": 0.1799692445061759, + "grad_norm": 35.746414965034276, + "learning_rate": 9.826707018939368e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.8984375, + "logps/chosen": -1114.0, + "logps/rejected": -924.0, + "loss": 0.4117, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.7138671875, + "rewards/margins": 6.125, + "rewards/rejected": -4.41015625, + "step": 907 + }, + { + "epoch": 0.18016766704697654, + "grad_norm": 39.39559298962972, + "learning_rate": 9.825849403772885e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.328125, + "logps/chosen": -732.0, + "logps/rejected": -512.0, + "loss": 0.5338, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.3974609375, + "rewards/margins": 3.578125, + "rewards/rejected": -2.1796875, + "step": 908 + }, + { + "epoch": 0.18036608958777717, + "grad_norm": 40.945588368383106, + "learning_rate": 9.824989713573286e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.12109375, + "logps/chosen": -1339.0, + "logps/rejected": -1114.5, + "loss": 0.3992, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.638671875, + "rewards/margins": 8.046875, + "rewards/rejected": -6.3984375, + "step": 909 + }, + { + "epoch": 0.1805645121285778, + "grad_norm": 35.37129186126692, + "learning_rate": 9.824127948752948e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.08203125, + "logps/chosen": -1181.0, + "logps/rejected": -890.0, + "loss": 0.4165, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.1015625, + "rewards/margins": 6.4375, + "rewards/rejected": -4.328125, + "step": 910 + }, + { + "epoch": 
0.18076293466937843, + "grad_norm": 49.1893284330336, + "learning_rate": 9.823264109725242e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.42578125, + "logps/chosen": -1038.0, + "logps/rejected": -746.0, + "loss": 0.4443, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.41796875, + "rewards/margins": 4.671875, + "rewards/rejected": -3.25390625, + "step": 911 + }, + { + "epoch": 0.18096135721017909, + "grad_norm": 40.70350324292238, + "learning_rate": 9.822398196904538e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.125, + "logps/chosen": -674.5, + "logps/rejected": -607.5, + "loss": 0.5381, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.32958984375, + "rewards/margins": 11.2265625, + "rewards/rejected": -9.859375, + "step": 912 + }, + { + "epoch": 0.18115977975097972, + "grad_norm": 46.02244920341313, + "learning_rate": 9.8215302107062e-07, + "logits/chosen": 3.84375, + "logits/rejected": 4.0, + "logps/chosen": -1379.0, + "logps/rejected": -965.5, + "loss": 0.4516, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.73046875, + "rewards/margins": 6.359375, + "rewards/rejected": -4.62109375, + "step": 913 + }, + { + "epoch": 0.18135820229178035, + "grad_norm": 40.23034372544256, + "learning_rate": 9.820660151546582e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.2109375, + "logps/chosen": -1044.5, + "logps/rejected": -674.5, + "loss": 0.5235, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5911865234375, + "rewards/margins": 4.9765625, + "rewards/rejected": -4.38671875, + "step": 914 + }, + { + "epoch": 0.18155662483258098, + "grad_norm": 37.68613714083773, + "learning_rate": 9.81978801984304e-07, + "logits/chosen": 3.60546875, + "logits/rejected": 4.07421875, + "logps/chosen": -1045.0, + "logps/rejected": -1458.0, + "loss": 0.3696, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.158203125, + "rewards/margins": 7.10546875, + "rewards/rejected": -4.953125, + "step": 915 + }, + { + "epoch": 0.1817550473733816, + 
"grad_norm": 45.26634398264257, + "learning_rate": 9.81891381601392e-07, + "logits/chosen": 3.87109375, + "logits/rejected": 3.828125, + "logps/chosen": -949.0, + "logps/rejected": -649.0, + "loss": 0.518, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.6015625, + "rewards/margins": 5.5859375, + "rewards/rejected": -3.98828125, + "step": 916 + }, + { + "epoch": 0.18195346991418226, + "grad_norm": 36.08518755189789, + "learning_rate": 9.818037540478556e-07, + "logits/chosen": 4.28515625, + "logits/rejected": 3.984375, + "logps/chosen": -1030.0, + "logps/rejected": -1027.0, + "loss": 0.404, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.3671875, + "rewards/margins": 5.3125, + "rewards/rejected": -3.9296875, + "step": 917 + }, + { + "epoch": 0.1821518924549829, + "grad_norm": 43.1101070328276, + "learning_rate": 9.81715919365729e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 3.8828125, + "logps/chosen": -1305.0, + "logps/rejected": -930.0, + "loss": 0.3833, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.30078125, + "rewards/margins": 6.296875, + "rewards/rejected": -3.99609375, + "step": 918 + }, + { + "epoch": 0.18235031499578352, + "grad_norm": 37.96175043041285, + "learning_rate": 9.816278775971445e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 3.953125, + "logps/chosen": -676.0, + "logps/rejected": -600.0, + "loss": 0.5271, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.4150390625, + "rewards/margins": 4.1640625, + "rewards/rejected": -2.75, + "step": 919 + }, + { + "epoch": 0.18254873753658415, + "grad_norm": 37.04263283711974, + "learning_rate": 9.81539628784334e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.1796875, + "logps/chosen": -1176.0, + "logps/rejected": -693.0, + "loss": 0.4814, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.666015625, + "rewards/margins": 4.87890625, + "rewards/rejected": -3.21484375, + "step": 920 + }, + { + "epoch": 0.18274716007738478, + "grad_norm": 
43.46899545125752, + "learning_rate": 9.814511729696295e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.14453125, + "logps/chosen": -879.5, + "logps/rejected": -595.0, + "loss": 0.4804, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.4404296875, + "rewards/margins": 4.984375, + "rewards/rejected": -3.55078125, + "step": 921 + }, + { + "epoch": 0.18294558261818542, + "grad_norm": 37.43562532135117, + "learning_rate": 9.813625101954613e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 4.06640625, + "logps/chosen": -993.0, + "logps/rejected": -578.0, + "loss": 0.4584, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.3203125, + "rewards/margins": 4.48046875, + "rewards/rejected": -3.1640625, + "step": 922 + }, + { + "epoch": 0.18314400515898607, + "grad_norm": 38.688116744446155, + "learning_rate": 9.812736405043591e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 4.1796875, + "logps/chosen": -1029.0, + "logps/rejected": -730.0, + "loss": 0.4781, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.69140625, + "rewards/margins": 5.11328125, + "rewards/rejected": -3.43359375, + "step": 923 + }, + { + "epoch": 0.1833424276997867, + "grad_norm": 40.3726572203701, + "learning_rate": 9.811845639389526e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.453125, + "logps/chosen": -1042.0, + "logps/rejected": -730.5, + "loss": 0.4702, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.6884765625, + "rewards/margins": 4.92578125, + "rewards/rejected": -3.25, + "step": 924 + }, + { + "epoch": 0.18354085024058733, + "grad_norm": 42.89616236035223, + "learning_rate": 9.8109528054197e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 4.1796875, + "logps/chosen": -950.0, + "logps/rejected": -550.5, + "loss": 0.397, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6923828125, + "rewards/margins": 5.16015625, + "rewards/rejected": -3.478515625, + "step": 925 + }, + { + "epoch": 0.18373927278138796, + "grad_norm": 
41.01014629602182, + "learning_rate": 9.810057903562388e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.3828125, + "logps/chosen": -879.0, + "logps/rejected": -941.0, + "loss": 0.4867, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4990234375, + "rewards/margins": 5.69921875, + "rewards/rejected": -4.203125, + "step": 926 + }, + { + "epoch": 0.1839376953221886, + "grad_norm": 41.126987392877204, + "learning_rate": 9.809160934246863e-07, + "logits/chosen": 3.54296875, + "logits/rejected": 3.54296875, + "logps/chosen": -1100.0, + "logps/rejected": -806.5, + "loss": 0.5001, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4677734375, + "rewards/margins": 4.9921875, + "rewards/rejected": -3.525390625, + "step": 927 + }, + { + "epoch": 0.18413611786298922, + "grad_norm": 33.36739201202468, + "learning_rate": 9.80826189790338e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.8125, + "logps/chosen": -1179.0, + "logps/rejected": -1841.0, + "loss": 0.4567, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3828125, + "rewards/margins": 8.0234375, + "rewards/rejected": -5.6328125, + "step": 928 + }, + { + "epoch": 0.18433454040378988, + "grad_norm": 32.439278379014915, + "learning_rate": 9.807360794963195e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 3.9609375, + "logps/chosen": -1011.0, + "logps/rejected": -653.5, + "loss": 0.4301, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.994140625, + "rewards/margins": 5.0625, + "rewards/rejected": -3.068359375, + "step": 929 + }, + { + "epoch": 0.1845329629445905, + "grad_norm": 37.81726036639921, + "learning_rate": 9.80645762585855e-07, + "logits/chosen": 3.984375, + "logits/rejected": 3.90625, + "logps/chosen": -1407.0, + "logps/rejected": -783.0, + "loss": 0.3663, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.931640625, + "rewards/margins": 5.7109375, + "rewards/rejected": -3.77734375, + "step": 930 + }, + { + "epoch": 0.18473138548539114, + "grad_norm": 
38.636868760749074, + "learning_rate": 9.80555239102268e-07, + "logits/chosen": 3.46875, + "logits/rejected": 3.640625, + "logps/chosen": -1300.0, + "logps/rejected": -1443.0, + "loss": 0.3075, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.33984375, + "rewards/margins": 7.640625, + "rewards/rejected": -5.3046875, + "step": 931 + }, + { + "epoch": 0.18492980802619177, + "grad_norm": 41.062279875740806, + "learning_rate": 9.804645090889805e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.94140625, + "logps/chosen": -755.0, + "logps/rejected": -756.0, + "loss": 0.5022, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.349609375, + "rewards/margins": 4.4296875, + "rewards/rejected": -3.08203125, + "step": 932 + }, + { + "epoch": 0.1851282305669924, + "grad_norm": 40.66846616698616, + "learning_rate": 9.803735725895146e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.1953125, + "logps/chosen": -2044.0, + "logps/rejected": -1048.0, + "loss": 0.4912, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.537109375, + "rewards/margins": 3.39453125, + "rewards/rejected": -3.94921875, + "step": 933 + }, + { + "epoch": 0.18532665310779303, + "grad_norm": 50.52889594857158, + "learning_rate": 9.802824296474907e-07, + "logits/chosen": 3.46484375, + "logits/rejected": 3.9296875, + "logps/chosen": -912.0, + "logps/rejected": -806.0, + "loss": 0.5336, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.392578125, + "rewards/margins": 5.21484375, + "rewards/rejected": -3.8359375, + "step": 934 + }, + { + "epoch": 0.1855250756485937, + "grad_norm": 35.12773593073634, + "learning_rate": 9.801910803066287e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 4.02734375, + "logps/chosen": -984.0, + "logps/rejected": -1013.0, + "loss": 0.6153, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.357421875, + "rewards/margins": 4.92578125, + "rewards/rejected": -3.572265625, + "step": 935 + }, + { + "epoch": 0.18572349818939432, + "grad_norm": 
35.12635645757832, + "learning_rate": 9.800995246107469e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.546875, + "logps/chosen": -778.0, + "logps/rejected": -1091.5, + "loss": 0.6014, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.505859375, + "rewards/margins": 3.984375, + "rewards/rejected": -2.48046875, + "step": 936 + }, + { + "epoch": 0.18592192073019495, + "grad_norm": 35.28658160469098, + "learning_rate": 9.800077626037633e-07, + "logits/chosen": 4.39453125, + "logits/rejected": 4.1953125, + "logps/chosen": -1423.0, + "logps/rejected": -862.0, + "loss": 0.465, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.109375, + "rewards/margins": 5.4609375, + "rewards/rejected": -3.361328125, + "step": 937 + }, + { + "epoch": 0.18612034327099558, + "grad_norm": 31.502802208333254, + "learning_rate": 9.799157943296943e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 3.984375, + "logps/chosen": -1162.0, + "logps/rejected": -882.0, + "loss": 0.307, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.486328125, + "rewards/margins": 7.015625, + "rewards/rejected": -4.53125, + "step": 938 + }, + { + "epoch": 0.1863187658117962, + "grad_norm": 32.074141763192046, + "learning_rate": 9.798236198326554e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.20703125, + "logps/chosen": -1226.0, + "logps/rejected": -1540.0, + "loss": 0.3801, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4052734375, + "rewards/margins": 7.2421875, + "rewards/rejected": -4.8203125, + "step": 939 + }, + { + "epoch": 0.18651718835259687, + "grad_norm": 36.35927843454932, + "learning_rate": 9.79731239156861e-07, + "logits/chosen": 4.546875, + "logits/rejected": 4.46875, + "logps/chosen": -1102.0, + "logps/rejected": -697.5, + "loss": 0.4118, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.96484375, + "rewards/margins": 6.1953125, + "rewards/rejected": -4.23828125, + "step": 940 + }, + { + "epoch": 0.1867156108933975, + "grad_norm": 38.187462484534315, + 
"learning_rate": 9.796386523466249e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.4921875, + "logps/chosen": -775.5, + "logps/rejected": -657.0, + "loss": 0.5898, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.072509765625, + "rewards/margins": 4.421875, + "rewards/rejected": -3.34375, + "step": 941 + }, + { + "epoch": 0.18691403343419813, + "grad_norm": 41.07435882826704, + "learning_rate": 9.79545859446359e-07, + "logits/chosen": 3.671875, + "logits/rejected": 3.890625, + "logps/chosen": -972.0, + "logps/rejected": -865.0, + "loss": 0.4573, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.802734375, + "rewards/margins": 5.90234375, + "rewards/rejected": -4.09375, + "step": 942 + }, + { + "epoch": 0.18711245597499876, + "grad_norm": 33.5159833823677, + "learning_rate": 9.794528605005742e-07, + "logits/chosen": 3.66796875, + "logits/rejected": 4.015625, + "logps/chosen": -633.0, + "logps/rejected": -584.5, + "loss": 0.5456, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.841796875, + "rewards/margins": 4.728515625, + "rewards/rejected": -3.890625, + "step": 943 + }, + { + "epoch": 0.1873108785157994, + "grad_norm": 35.76300880499827, + "learning_rate": 9.79359655553881e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.19921875, + "logps/chosen": -943.0, + "logps/rejected": -767.0, + "loss": 0.5337, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.65234375, + "rewards/margins": 4.765625, + "rewards/rejected": -3.103515625, + "step": 944 + }, + { + "epoch": 0.18750930105660002, + "grad_norm": 33.8873406167711, + "learning_rate": 9.792662446509876e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.265625, + "logps/chosen": -906.0, + "logps/rejected": -616.5, + "loss": 0.4279, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.8955078125, + "rewards/margins": 5.60546875, + "rewards/rejected": -3.724609375, + "step": 945 + }, + { + "epoch": 0.18770772359740068, + "grad_norm": 34.162125868863995, + "learning_rate": 
9.791726278367021e-07, + "logits/chosen": 4.0, + "logits/rejected": 3.86328125, + "logps/chosen": -1270.0, + "logps/rejected": -799.0, + "loss": 0.4914, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.01171875, + "rewards/margins": 5.9765625, + "rewards/rejected": -3.98046875, + "step": 946 + }, + { + "epoch": 0.1879061461382013, + "grad_norm": 33.77115983853969, + "learning_rate": 9.790788051559305e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.4296875, + "logps/chosen": -1096.0, + "logps/rejected": -1355.0, + "loss": 0.3436, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.756591796875, + "rewards/margins": 7.12890625, + "rewards/rejected": -5.37109375, + "step": 947 + }, + { + "epoch": 0.18810456867900194, + "grad_norm": 37.86465119548621, + "learning_rate": 9.789847766536777e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.265625, + "logps/chosen": -906.0, + "logps/rejected": -733.0, + "loss": 0.4444, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.474609375, + "rewards/margins": 5.30859375, + "rewards/rejected": -3.828125, + "step": 948 + }, + { + "epoch": 0.18830299121980257, + "grad_norm": 31.374752374887123, + "learning_rate": 9.788905423750478e-07, + "logits/chosen": 4.0, + "logits/rejected": 3.8984375, + "logps/chosen": -1401.0, + "logps/rejected": -571.5, + "loss": 0.4895, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.26953125, + "rewards/margins": 3.8076171875, + "rewards/rejected": -3.533203125, + "step": 949 + }, + { + "epoch": 0.1885014137606032, + "grad_norm": 37.0683192789802, + "learning_rate": 9.787961023652433e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.4296875, + "logps/chosen": -870.5, + "logps/rejected": -684.5, + "loss": 0.4256, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.564453125, + "rewards/margins": 5.11328125, + "rewards/rejected": -3.546875, + "step": 950 + }, + { + "epoch": 0.18869983630140383, + "grad_norm": 36.98643415776474, + "learning_rate": 9.787014566695652e-07, 
+ "logits/chosen": 3.68359375, + "logits/rejected": 3.56640625, + "logps/chosen": -900.5, + "logps/rejected": -549.0, + "loss": 0.4919, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.76171875, + "rewards/margins": 3.75390625, + "rewards/rejected": -2.98828125, + "step": 951 + }, + { + "epoch": 0.1888982588422045, + "grad_norm": 40.478088542134124, + "learning_rate": 9.786066053334137e-07, + "logits/chosen": 3.74609375, + "logits/rejected": 3.64453125, + "logps/chosen": -1065.0, + "logps/rejected": -712.0, + "loss": 0.4376, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.91015625, + "rewards/margins": 5.8671875, + "rewards/rejected": -3.94921875, + "step": 952 + }, + { + "epoch": 0.18909668138300512, + "grad_norm": 45.82195110145466, + "learning_rate": 9.785115484022869e-07, + "logits/chosen": 3.953125, + "logits/rejected": 3.68359375, + "logps/chosen": -1045.0, + "logps/rejected": -904.5, + "loss": 0.3926, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.58203125, + "rewards/margins": 6.4921875, + "rewards/rejected": -4.91015625, + "step": 953 + }, + { + "epoch": 0.18929510392380575, + "grad_norm": 42.84951857962382, + "learning_rate": 9.78416285921782e-07, + "logits/chosen": 3.96875, + "logits/rejected": 4.19140625, + "logps/chosen": -1037.0, + "logps/rejected": -606.5, + "loss": 0.6541, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18768310546875, + "rewards/margins": 2.40625, + "rewards/rejected": -2.5888671875, + "step": 954 + }, + { + "epoch": 0.18949352646460638, + "grad_norm": 45.72388923431649, + "learning_rate": 9.78320817937595e-07, + "logits/chosen": 3.73828125, + "logits/rejected": 3.80078125, + "logps/chosen": -1079.0, + "logps/rejected": -1303.0, + "loss": 0.5068, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.15045166015625, + "rewards/margins": 5.29296875, + "rewards/rejected": -4.1484375, + "step": 955 + }, + { + "epoch": 0.189691949005407, + "grad_norm": 43.5102126354342, + "learning_rate": 9.7822514449552e-07, + 
"logits/chosen": 3.84765625, + "logits/rejected": 4.09375, + "logps/chosen": -1122.0, + "logps/rejected": -834.0, + "loss": 0.555, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.4111328125, + "rewards/margins": 4.6015625, + "rewards/rejected": -3.193359375, + "step": 956 + }, + { + "epoch": 0.18989037154620764, + "grad_norm": 48.175309410746834, + "learning_rate": 9.781292656414498e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 3.8125, + "logps/chosen": -1003.0, + "logps/rejected": -718.0, + "loss": 0.4925, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8828125, + "rewards/margins": 5.01953125, + "rewards/rejected": -4.13671875, + "step": 957 + }, + { + "epoch": 0.1900887940870083, + "grad_norm": 38.54730221047776, + "learning_rate": 9.780331814213758e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 4.02734375, + "logps/chosen": -1062.0, + "logps/rejected": -834.0, + "loss": 0.4921, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.697265625, + "rewards/margins": 4.859375, + "rewards/rejected": -3.16796875, + "step": 958 + }, + { + "epoch": 0.19028721662780892, + "grad_norm": 31.878681298404477, + "learning_rate": 9.779368918813883e-07, + "logits/chosen": 4.3515625, + "logits/rejected": 4.1328125, + "logps/chosen": -996.0, + "logps/rejected": -679.5, + "loss": 0.3569, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.5908203125, + "rewards/margins": 5.75, + "rewards/rejected": -4.15625, + "step": 959 + }, + { + "epoch": 0.19048563916860956, + "grad_norm": 36.92777912175165, + "learning_rate": 9.778403970676748e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.02734375, + "logps/chosen": -1097.0, + "logps/rejected": -990.0, + "loss": 0.4394, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.3115234375, + "rewards/margins": 6.84375, + "rewards/rejected": -5.53125, + "step": 960 + }, + { + "epoch": 0.19068406170941019, + "grad_norm": 48.84193656957536, + "learning_rate": 9.777436970265231e-07, + "logits/chosen": 
4.06640625, + "logits/rejected": 3.97265625, + "logps/chosen": -833.5, + "logps/rejected": -640.0, + "loss": 0.5842, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.1474609375, + "rewards/margins": 3.69140625, + "rewards/rejected": -2.544921875, + "step": 961 + }, + { + "epoch": 0.19088248425021082, + "grad_norm": 42.825410881926, + "learning_rate": 9.776467918043177e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.23046875, + "logps/chosen": -1887.0, + "logps/rejected": -996.0, + "loss": 0.4087, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.12890625, + "rewards/margins": 5.8046875, + "rewards/rejected": -4.671875, + "step": 962 + }, + { + "epoch": 0.19108090679101145, + "grad_norm": 41.11777773258664, + "learning_rate": 9.775496814475429e-07, + "logits/chosen": 3.64453125, + "logits/rejected": 3.61328125, + "logps/chosen": -606.0, + "logps/rejected": -569.0, + "loss": 0.576, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.747802734375, + "rewards/margins": 4.58203125, + "rewards/rejected": -3.8203125, + "step": 963 + }, + { + "epoch": 0.1912793293318121, + "grad_norm": 34.60008843446018, + "learning_rate": 9.774523660027806e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 3.73046875, + "logps/chosen": -1068.0, + "logps/rejected": -1135.0, + "loss": 0.4544, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.8310546875, + "rewards/margins": 7.03125, + "rewards/rejected": -5.20703125, + "step": 964 + }, + { + "epoch": 0.19147775187261273, + "grad_norm": 36.17658028238377, + "learning_rate": 9.773548455167112e-07, + "logits/chosen": 4.5859375, + "logits/rejected": 4.6015625, + "logps/chosen": -1208.0, + "logps/rejected": -1084.0, + "loss": 0.405, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0546875, + "rewards/margins": 6.375, + "rewards/rejected": -4.3203125, + "step": 965 + }, + { + "epoch": 0.19167617441341336, + "grad_norm": 33.76899299523928, + "learning_rate": 9.772571200361137e-07, + "logits/chosen": 4.13671875, + 
"logits/rejected": 4.2109375, + "logps/chosen": -793.0, + "logps/rejected": -515.5, + "loss": 0.5508, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.3681640625, + "rewards/margins": 4.47265625, + "rewards/rejected": -3.10546875, + "step": 966 + }, + { + "epoch": 0.191874596954214, + "grad_norm": 37.204025798702105, + "learning_rate": 9.771591896078652e-07, + "logits/chosen": 3.51953125, + "logits/rejected": 3.6171875, + "logps/chosen": -1390.0, + "logps/rejected": -799.0, + "loss": 0.4239, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.216796875, + "rewards/margins": 6.15234375, + "rewards/rejected": -3.94140625, + "step": 967 + }, + { + "epoch": 0.19207301949501462, + "grad_norm": 51.65545713544031, + "learning_rate": 9.770610542789412e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.1640625, + "logps/chosen": -856.0, + "logps/rejected": -871.0, + "loss": 0.5768, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.548828125, + "rewards/margins": 5.609375, + "rewards/rejected": -4.056640625, + "step": 968 + }, + { + "epoch": 0.19227144203581528, + "grad_norm": 39.07407129341576, + "learning_rate": 9.769627140964154e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.24609375, + "logps/chosen": -972.0, + "logps/rejected": -822.0, + "loss": 0.4854, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.97900390625, + "rewards/margins": 5.35546875, + "rewards/rejected": -4.37890625, + "step": 969 + }, + { + "epoch": 0.1924698645766159, + "grad_norm": 35.133035343866936, + "learning_rate": 9.7686416910746e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.87890625, + "logps/chosen": -1067.0, + "logps/rejected": -658.5, + "loss": 0.4749, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.8203125, + "rewards/margins": 5.57421875, + "rewards/rejected": -3.748046875, + "step": 970 + }, + { + "epoch": 0.19266828711741654, + "grad_norm": 33.17179983723474, + "learning_rate": 9.767654193593452e-07, + "logits/chosen": 4.3203125, + 
"logits/rejected": 4.33203125, + "logps/chosen": -639.0, + "logps/rejected": -757.5, + "loss": 0.6275, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.71435546875, + "rewards/margins": 4.810546875, + "rewards/rejected": -4.1044921875, + "step": 971 + }, + { + "epoch": 0.19286670965821717, + "grad_norm": 42.74828283754365, + "learning_rate": 9.766664648994393e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.26171875, + "logps/chosen": -1197.0, + "logps/rejected": -807.0, + "loss": 0.45, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8408203125, + "rewards/margins": 6.5078125, + "rewards/rejected": -4.6640625, + "step": 972 + }, + { + "epoch": 0.1930651321990178, + "grad_norm": 47.05874325890061, + "learning_rate": 9.765673057752092e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.22265625, + "logps/chosen": -1094.0, + "logps/rejected": -1041.0, + "loss": 0.5292, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.1005859375, + "rewards/margins": 7.625, + "rewards/rejected": -6.5234375, + "step": 973 + }, + { + "epoch": 0.19326355473981843, + "grad_norm": 34.15273342421655, + "learning_rate": 9.7646794203422e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 4.09375, + "logps/chosen": -1077.0, + "logps/rejected": -768.0, + "loss": 0.4486, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.15625, + "rewards/margins": 5.94921875, + "rewards/rejected": -3.798828125, + "step": 974 + }, + { + "epoch": 0.1934619772806191, + "grad_norm": 42.94846839428143, + "learning_rate": 9.763683737241342e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.9296875, + "logps/chosen": -784.0, + "logps/rejected": -750.0, + "loss": 0.5133, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.216796875, + "rewards/margins": 14.34765625, + "rewards/rejected": -13.11328125, + "step": 975 + }, + { + "epoch": 0.19366039982141972, + "grad_norm": 37.137432621629166, + "learning_rate": 9.762686008927133e-07, + "logits/chosen": 3.72265625, + 
"logits/rejected": 3.921875, + "logps/chosen": -1138.0, + "logps/rejected": -868.0, + "loss": 0.457, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.6884765625, + "rewards/margins": 6.35546875, + "rewards/rejected": -4.66796875, + "step": 976 + }, + { + "epoch": 0.19385882236222035, + "grad_norm": 43.47588788245441, + "learning_rate": 9.761686235878166e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 4.1015625, + "logps/chosen": -583.0, + "logps/rejected": -616.5, + "loss": 0.5319, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.83203125, + "rewards/margins": 4.47265625, + "rewards/rejected": -3.640625, + "step": 977 + }, + { + "epoch": 0.19405724490302098, + "grad_norm": 36.390523943891054, + "learning_rate": 9.76068441857401e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.453125, + "logps/chosen": -984.0, + "logps/rejected": -617.0, + "loss": 0.4531, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.724853515625, + "rewards/margins": 4.59765625, + "rewards/rejected": -2.869140625, + "step": 978 + }, + { + "epoch": 0.1942556674438216, + "grad_norm": 44.440263175305255, + "learning_rate": 9.759680557495226e-07, + "logits/chosen": 3.6875, + "logits/rejected": 3.80859375, + "logps/chosen": -1223.0, + "logps/rejected": -887.0, + "loss": 0.3962, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2320556640625, + "rewards/margins": 6.20703125, + "rewards/rejected": -4.96484375, + "step": 979 + }, + { + "epoch": 0.19445408998462224, + "grad_norm": 28.529415979063106, + "learning_rate": 9.758674653123344e-07, + "logits/chosen": 4.46484375, + "logits/rejected": 4.453125, + "logps/chosen": -907.5, + "logps/rejected": -685.0, + "loss": 0.4311, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.076171875, + "rewards/margins": 5.904296875, + "rewards/rejected": -3.822265625, + "step": 980 + }, + { + "epoch": 0.1946525125254229, + "grad_norm": 40.917212065963064, + "learning_rate": 9.757666705940878e-07, + "logits/chosen": 4.3046875, + 
"logits/rejected": 4.44140625, + "logps/chosen": -726.0, + "logps/rejected": -609.5, + "loss": 0.7062, + "rewards/accuracies": 0.65625, + "rewards/chosen": 1.12060546875, + "rewards/margins": 2.169921875, + "rewards/rejected": -1.047607421875, + "step": 981 + }, + { + "epoch": 0.19485093506622353, + "grad_norm": 30.95024327934473, + "learning_rate": 9.756656716431321e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 3.9921875, + "logps/chosen": -1274.0, + "logps/rejected": -798.0, + "loss": 0.4541, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.4443359375, + "rewards/margins": 6.40625, + "rewards/rejected": -3.9609375, + "step": 982 + }, + { + "epoch": 0.19504935760702416, + "grad_norm": 38.54978969771584, + "learning_rate": 9.755644685079151e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.15234375, + "logps/chosen": -977.0, + "logps/rejected": -741.0, + "loss": 0.5077, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.435546875, + "rewards/margins": 5.6015625, + "rewards/rejected": -4.16796875, + "step": 983 + }, + { + "epoch": 0.1952477801478248, + "grad_norm": 31.96325664125493, + "learning_rate": 9.75463061236982e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.0390625, + "logps/chosen": -1182.0, + "logps/rejected": -1456.0, + "loss": 0.2353, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.34375, + "rewards/margins": 9.359375, + "rewards/rejected": -7.0234375, + "step": 984 + }, + { + "epoch": 0.19544620268862542, + "grad_norm": 31.853270518885026, + "learning_rate": 9.75361449878976e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 4.078125, + "logps/chosen": -752.0, + "logps/rejected": -543.0, + "loss": 0.4386, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.54296875, + "rewards/margins": 4.921875, + "rewards/rejected": -3.37109375, + "step": 985 + }, + { + "epoch": 0.19564462522942605, + "grad_norm": 47.01624225672347, + "learning_rate": 9.752596344826384e-07, + "logits/chosen": 4.390625, + 
"logits/rejected": 4.38671875, + "logps/chosen": -1156.0, + "logps/rejected": -663.0, + "loss": 0.4715, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.556640625, + "rewards/margins": 5.18359375, + "rewards/rejected": -3.6171875, + "step": 986 + }, + { + "epoch": 0.1958430477702267, + "grad_norm": 35.86865609728698, + "learning_rate": 9.751576150968081e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.52734375, + "logps/chosen": -890.0, + "logps/rejected": -607.5, + "loss": 0.4573, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8017578125, + "rewards/margins": 5.0, + "rewards/rejected": -3.201171875, + "step": 987 + }, + { + "epoch": 0.19604147031102734, + "grad_norm": 33.598759069532775, + "learning_rate": 9.75055391770422e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.19140625, + "logps/chosen": -1070.0, + "logps/rejected": -668.0, + "loss": 0.3397, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.8740234375, + "rewards/margins": 6.0234375, + "rewards/rejected": -4.15625, + "step": 988 + }, + { + "epoch": 0.19623989285182797, + "grad_norm": 36.171470812604376, + "learning_rate": 9.74952964552515e-07, + "logits/chosen": 3.640625, + "logits/rejected": 3.88671875, + "logps/chosen": -939.0, + "logps/rejected": -832.5, + "loss": 0.47, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6435546875, + "rewards/margins": 4.96484375, + "rewards/rejected": -3.318359375, + "step": 989 + }, + { + "epoch": 0.1964383153926286, + "grad_norm": 40.13825417838295, + "learning_rate": 9.748503334922194e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 3.8203125, + "logps/chosen": -1022.0, + "logps/rejected": -726.0, + "loss": 0.5598, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.53515625, + "rewards/margins": 4.71875, + "rewards/rejected": -3.185546875, + "step": 990 + }, + { + "epoch": 0.19663673793342923, + "grad_norm": 42.54182982701868, + "learning_rate": 9.747474986387654e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 
4.0703125, + "logps/chosen": -1157.0, + "logps/rejected": -731.0, + "loss": 0.4225, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.947265625, + "rewards/margins": 6.5390625, + "rewards/rejected": -4.58984375, + "step": 991 + }, + { + "epoch": 0.19683516047422986, + "grad_norm": 35.40664915641474, + "learning_rate": 9.746444600414812e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.9375, + "logps/chosen": -837.0, + "logps/rejected": -587.0, + "loss": 0.4872, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1318359375, + "rewards/margins": 4.44140625, + "rewards/rejected": -3.306640625, + "step": 992 + }, + { + "epoch": 0.19703358301503052, + "grad_norm": 41.183116558141926, + "learning_rate": 9.745412177497928e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.890625, + "logps/chosen": -928.0, + "logps/rejected": -685.0, + "loss": 0.5267, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.74560546875, + "rewards/margins": 4.47265625, + "rewards/rejected": -3.72265625, + "step": 993 + }, + { + "epoch": 0.19723200555583115, + "grad_norm": 43.60250594030617, + "learning_rate": 9.744377718132233e-07, + "logits/chosen": 3.984375, + "logits/rejected": 4.19140625, + "logps/chosen": -861.0, + "logps/rejected": -1453.0, + "loss": 0.4759, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.169921875, + "rewards/margins": 6.84375, + "rewards/rejected": -4.697265625, + "step": 994 + }, + { + "epoch": 0.19743042809663178, + "grad_norm": 44.76131248710307, + "learning_rate": 9.743341222813941e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.05078125, + "logps/chosen": -1164.0, + "logps/rejected": -792.0, + "loss": 0.3903, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.009765625, + "rewards/margins": 5.7421875, + "rewards/rejected": -3.7265625, + "step": 995 + }, + { + "epoch": 0.1976288506374324, + "grad_norm": 39.216132973676615, + "learning_rate": 9.742302692040241e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.19140625, 
+ "logps/chosen": -1146.0, + "logps/rejected": -612.5, + "loss": 0.4865, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.966796875, + "rewards/margins": 5.1953125, + "rewards/rejected": -3.23046875, + "step": 996 + }, + { + "epoch": 0.19782727317823304, + "grad_norm": 41.679633346057514, + "learning_rate": 9.741262126309296e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.1953125, + "logps/chosen": -765.0, + "logps/rejected": -1430.5, + "loss": 0.4694, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.841064453125, + "rewards/margins": 6.18359375, + "rewards/rejected": -5.341796875, + "step": 997 + }, + { + "epoch": 0.1980256957190337, + "grad_norm": 38.8056960424125, + "learning_rate": 9.740219526120248e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 4.16796875, + "logps/chosen": -1327.0, + "logps/rejected": -913.0, + "loss": 0.3463, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0712890625, + "rewards/margins": 7.15625, + "rewards/rejected": -5.08984375, + "step": 998 + }, + { + "epoch": 0.19822411825983433, + "grad_norm": 36.27134780824437, + "learning_rate": 9.739174891973213e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.0546875, + "logps/chosen": -1239.0, + "logps/rejected": -695.0, + "loss": 0.4533, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.60491943359375, + "rewards/margins": 4.9375, + "rewards/rejected": -3.33984375, + "step": 999 + }, + { + "epoch": 0.19842254080063496, + "grad_norm": 45.38137981160806, + "learning_rate": 9.738128224369284e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 3.96875, + "logps/chosen": -1076.0, + "logps/rejected": -698.0, + "loss": 0.51, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.9921875, + "rewards/margins": 5.5625, + "rewards/rejected": -3.5703125, + "step": 1000 + }, + { + "epoch": 0.19862096334143559, + "grad_norm": 41.55133238301392, + "learning_rate": 9.73707952381053e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.234375, + 
"logps/chosen": -534.0, + "logps/rejected": -704.0, + "loss": 0.5441, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.6083984375, + "rewards/margins": 4.40625, + "rewards/rejected": -3.796875, + "step": 1001 + }, + { + "epoch": 0.19881938588223622, + "grad_norm": 34.80322747985481, + "learning_rate": 9.736028790799993e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.20703125, + "logps/chosen": -830.0, + "logps/rejected": -729.75, + "loss": 0.4809, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.3515625, + "rewards/margins": 6.16796875, + "rewards/rejected": -4.828125, + "step": 1002 + }, + { + "epoch": 0.19901780842303685, + "grad_norm": 33.11646775167735, + "learning_rate": 9.734976025841688e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.01171875, + "logps/chosen": -931.0, + "logps/rejected": -891.0, + "loss": 0.4646, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.45751953125, + "rewards/margins": 6.30859375, + "rewards/rejected": -4.8359375, + "step": 1003 + }, + { + "epoch": 0.1992162309638375, + "grad_norm": 28.741183022648965, + "learning_rate": 9.733921229440615e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.12890625, + "logps/chosen": -1089.0, + "logps/rejected": -1120.0, + "loss": 0.5049, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7812042236328125, + "rewards/margins": 5.68359375, + "rewards/rejected": -4.8828125, + "step": 1004 + }, + { + "epoch": 0.19941465350463813, + "grad_norm": 36.23324088465678, + "learning_rate": 9.732864402102732e-07, + "logits/chosen": 3.50390625, + "logits/rejected": 3.37109375, + "logps/chosen": -961.0, + "logps/rejected": -535.0, + "loss": 0.3941, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.1533203125, + "rewards/margins": 5.3828125, + "rewards/rejected": -4.23046875, + "step": 1005 + }, + { + "epoch": 0.19961307604543876, + "grad_norm": 43.90676464818714, + "learning_rate": 9.731805544334986e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 
4.20703125, + "logps/chosen": -1102.0, + "logps/rejected": -1267.0, + "loss": 0.4126, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5341796875, + "rewards/margins": 8.125, + "rewards/rejected": -6.58984375, + "step": 1006 + }, + { + "epoch": 0.1998114985862394, + "grad_norm": 37.41909348638878, + "learning_rate": 9.730744656645294e-07, + "logits/chosen": 4.37890625, + "logits/rejected": 4.39453125, + "logps/chosen": -1330.0, + "logps/rejected": -915.0, + "loss": 0.412, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5458984375, + "rewards/margins": 7.0859375, + "rewards/rejected": -5.52734375, + "step": 1007 + }, + { + "epoch": 0.20000992112704002, + "grad_norm": 34.91061255559916, + "learning_rate": 9.729681739542539e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 4.2890625, + "logps/chosen": -1114.0, + "logps/rejected": -854.0, + "loss": 0.4768, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.6640625, + "rewards/margins": 5.59375, + "rewards/rejected": -3.91796875, + "step": 1008 + }, + { + "epoch": 0.20020834366784065, + "grad_norm": 33.44092466442902, + "learning_rate": 9.728616793536587e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.30078125, + "logps/chosen": -1247.0, + "logps/rejected": -887.0, + "loss": 0.5094, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.9033203125, + "rewards/margins": 8.3046875, + "rewards/rejected": -6.41015625, + "step": 1009 + }, + { + "epoch": 0.2004067662086413, + "grad_norm": 38.49501268154352, + "learning_rate": 9.727549819138273e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 4.0390625, + "logps/chosen": -1087.0, + "logps/rejected": -840.0, + "loss": 0.4795, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.85546875, + "rewards/margins": 5.1875, + "rewards/rejected": -3.328125, + "step": 1010 + }, + { + "epoch": 0.20060518874944194, + "grad_norm": 31.39210220706726, + "learning_rate": 9.726480816859407e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 
4.18359375, + "logps/chosen": -1186.0, + "logps/rejected": -596.0, + "loss": 0.5775, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.8369140625, + "rewards/margins": 4.5625, + "rewards/rejected": -3.72265625, + "step": 1011 + }, + { + "epoch": 0.20080361129024257, + "grad_norm": 34.82342441819763, + "learning_rate": 9.72540978721277e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.3828125, + "logps/chosen": -936.0, + "logps/rejected": -668.0, + "loss": 0.376, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.0625, + "rewards/margins": 6.1796875, + "rewards/rejected": -4.12109375, + "step": 1012 + }, + { + "epoch": 0.2010020338310432, + "grad_norm": 31.348075972159595, + "learning_rate": 9.724336730712114e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 3.9453125, + "logps/chosen": -1126.0, + "logps/rejected": -504.0, + "loss": 0.4327, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.828125, + "rewards/margins": 5.234375, + "rewards/rejected": -3.40234375, + "step": 1013 + }, + { + "epoch": 0.20120045637184383, + "grad_norm": 39.989181025827904, + "learning_rate": 9.723261647872166e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.296875, + "logps/chosen": -853.0, + "logps/rejected": -564.5, + "loss": 0.4999, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.3046875, + "rewards/margins": 4.82421875, + "rewards/rejected": -3.525390625, + "step": 1014 + }, + { + "epoch": 0.20139887891264446, + "grad_norm": 54.62482851302131, + "learning_rate": 9.722184539208626e-07, + "logits/chosen": 4.4921875, + "logits/rejected": 4.4609375, + "logps/chosen": -1306.0, + "logps/rejected": -921.0, + "loss": 0.4977, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.2724609375, + "rewards/margins": 6.2109375, + "rewards/rejected": -4.94921875, + "step": 1015 + }, + { + "epoch": 0.20159730145344512, + "grad_norm": 39.04193851943587, + "learning_rate": 9.72110540523816e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.9765625, + 
"logps/chosen": -962.0, + "logps/rejected": -719.5, + "loss": 0.5183, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.607421875, + "rewards/margins": 4.3984375, + "rewards/rejected": -2.7890625, + "step": 1016 + }, + { + "epoch": 0.20179572399424575, + "grad_norm": 32.50343040807051, + "learning_rate": 9.720024246478414e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.984375, + "logps/chosen": -951.0, + "logps/rejected": -645.0, + "loss": 0.3933, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.033203125, + "rewards/margins": 5.359375, + "rewards/rejected": -3.32421875, + "step": 1017 + }, + { + "epoch": 0.20199414653504638, + "grad_norm": 38.61033266312955, + "learning_rate": 9.718941063447996e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.078125, + "logps/chosen": -946.0, + "logps/rejected": -717.5, + "loss": 0.4259, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.875, + "rewards/margins": 5.84375, + "rewards/rejected": -3.96484375, + "step": 1018 + }, + { + "epoch": 0.202192569075847, + "grad_norm": 37.39297039506301, + "learning_rate": 9.717855856666493e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.06640625, + "logps/chosen": -972.0, + "logps/rejected": -683.5, + "loss": 0.4499, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6220703125, + "rewards/margins": 4.703125, + "rewards/rejected": -3.08203125, + "step": 1019 + }, + { + "epoch": 0.20239099161664764, + "grad_norm": 41.772955333207946, + "learning_rate": 9.716768626654459e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.77734375, + "logps/chosen": -1056.0, + "logps/rejected": -675.0, + "loss": 0.5353, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.4951171875, + "rewards/margins": 4.2890625, + "rewards/rejected": -2.791015625, + "step": 1020 + }, + { + "epoch": 0.2025894141574483, + "grad_norm": 36.757274045303234, + "learning_rate": 9.715679373933418e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.3046875, + "logps/chosen": 
-1032.0, + "logps/rejected": -662.25, + "loss": 0.4014, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.62890625, + "rewards/margins": 5.75390625, + "rewards/rejected": -4.125, + "step": 1021 + }, + { + "epoch": 0.20278783669824893, + "grad_norm": 40.73153637012608, + "learning_rate": 9.714588099025866e-07, + "logits/chosen": 4.5234375, + "logits/rejected": 4.2734375, + "logps/chosen": -870.0, + "logps/rejected": -565.0, + "loss": 0.4452, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.496826171875, + "rewards/margins": 4.8828125, + "rewards/rejected": -3.37890625, + "step": 1022 + }, + { + "epoch": 0.20298625923904956, + "grad_norm": 34.25201505059987, + "learning_rate": 9.71349480245527e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 4.21484375, + "logps/chosen": -1148.0, + "logps/rejected": -950.0, + "loss": 0.4133, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.74609375, + "rewards/margins": 7.203125, + "rewards/rejected": -5.4609375, + "step": 1023 + }, + { + "epoch": 0.2031846817798502, + "grad_norm": 35.163349267229485, + "learning_rate": 9.712399484746059e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 4.28125, + "logps/chosen": -1254.0, + "logps/rejected": -1417.0, + "loss": 0.3806, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.400390625, + "rewards/margins": 7.9375, + "rewards/rejected": -5.546875, + "step": 1024 + }, + { + "epoch": 0.20338310432065082, + "grad_norm": 51.65806722644946, + "learning_rate": 9.711302146423642e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 3.89453125, + "logps/chosen": -1147.0, + "logps/rejected": -668.5, + "loss": 0.3869, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.4619140625, + "rewards/margins": 6.4609375, + "rewards/rejected": -4.984375, + "step": 1025 + }, + { + "epoch": 0.20358152686145145, + "grad_norm": 35.63407070516757, + "learning_rate": 9.710202788014393e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 3.97265625, + "logps/chosen": -890.0, + 
"logps/rejected": -1358.0, + "loss": 0.4092, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.76171875, + "rewards/margins": 7.1875, + "rewards/rejected": -5.421875, + "step": 1026 + }, + { + "epoch": 0.2037799494022521, + "grad_norm": 32.74786511564433, + "learning_rate": 9.709101410045652e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.4765625, + "logps/chosen": -943.0, + "logps/rejected": -764.0, + "loss": 0.5785, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.60369873046875, + "rewards/margins": 3.72265625, + "rewards/rejected": -3.12890625, + "step": 1027 + }, + { + "epoch": 0.20397837194305274, + "grad_norm": 39.52702842692453, + "learning_rate": 9.707998013045735e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.41015625, + "logps/chosen": -883.0, + "logps/rejected": -572.0, + "loss": 0.5661, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.26953125, + "rewards/margins": 4.169921875, + "rewards/rejected": -2.900390625, + "step": 1028 + }, + { + "epoch": 0.20417679448385337, + "grad_norm": 39.919175437122284, + "learning_rate": 9.706892597543914e-07, + "logits/chosen": 3.8515625, + "logits/rejected": 3.8984375, + "logps/chosen": -772.0, + "logps/rejected": -639.0, + "loss": 0.5354, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.33154296875, + "rewards/margins": 7.29296875, + "rewards/rejected": -5.9453125, + "step": 1029 + }, + { + "epoch": 0.204375217024654, + "grad_norm": 42.70391691423326, + "learning_rate": 9.705785164070447e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.015625, + "logps/chosen": -865.0, + "logps/rejected": -519.0, + "loss": 0.4727, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.5224609375, + "rewards/margins": 5.03515625, + "rewards/rejected": -3.51171875, + "step": 1030 + }, + { + "epoch": 0.20457363956545463, + "grad_norm": 40.71237736794132, + "learning_rate": 9.704675713156543e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 4.08984375, + "logps/chosen": -1172.0, + 
"logps/rejected": -1757.0, + "loss": 0.5402, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.943359375, + "rewards/margins": 6.03125, + "rewards/rejected": -4.0859375, + "step": 1031 + }, + { + "epoch": 0.20477206210625526, + "grad_norm": 44.77999130404101, + "learning_rate": 9.70356424533439e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.5234375, + "logps/chosen": -866.5, + "logps/rejected": -714.5, + "loss": 0.5161, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.40771484375, + "rewards/margins": 5.19921875, + "rewards/rejected": -3.79296875, + "step": 1032 + }, + { + "epoch": 0.20497048464705592, + "grad_norm": 39.3460054119232, + "learning_rate": 9.702450761137136e-07, + "logits/chosen": 4.3984375, + "logits/rejected": 4.46875, + "logps/chosen": -1202.0, + "logps/rejected": -1407.0, + "loss": 0.4361, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.513671875, + "rewards/margins": 7.59765625, + "rewards/rejected": -5.1044921875, + "step": 1033 + }, + { + "epoch": 0.20516890718785655, + "grad_norm": 40.074580821990146, + "learning_rate": 9.701335261098901e-07, + "logits/chosen": 4.61328125, + "logits/rejected": 4.6953125, + "logps/chosen": -832.0, + "logps/rejected": -719.5, + "loss": 0.4008, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.83203125, + "rewards/margins": 6.21875, + "rewards/rejected": -4.3828125, + "step": 1034 + }, + { + "epoch": 0.20536732972865718, + "grad_norm": 30.207452674941287, + "learning_rate": 9.700217745754772e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 4.12890625, + "logps/chosen": -943.0, + "logps/rejected": -598.0, + "loss": 0.4213, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.779296875, + "rewards/margins": 4.96875, + "rewards/rejected": -3.185546875, + "step": 1035 + }, + { + "epoch": 0.2055657522694578, + "grad_norm": 48.20140259982545, + "learning_rate": 9.699098215640799e-07, + "logits/chosen": 4.234375, + "logits/rejected": 3.875, + "logps/chosen": -1065.0, + 
"logps/rejected": -795.5, + "loss": 0.4953, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.783203125, + "rewards/margins": 10.6171875, + "rewards/rejected": -8.83203125, + "step": 1036 + }, + { + "epoch": 0.20576417481025844, + "grad_norm": 36.39243632300597, + "learning_rate": 9.697976671294003e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 4.3359375, + "logps/chosen": -923.5, + "logps/rejected": -634.5, + "loss": 0.3644, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.880859375, + "rewards/margins": 5.8671875, + "rewards/rejected": -3.9765625, + "step": 1037 + }, + { + "epoch": 0.20596259735105907, + "grad_norm": 37.09111053225541, + "learning_rate": 9.696853113252365e-07, + "logits/chosen": 4.546875, + "logits/rejected": 4.5, + "logps/chosen": -813.0, + "logps/rejected": -647.0, + "loss": 0.6191, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.8193359375, + "rewards/margins": 3.771484375, + "rewards/rejected": -1.943359375, + "step": 1038 + }, + { + "epoch": 0.20616101989185973, + "grad_norm": 38.13618346316767, + "learning_rate": 9.695727542054841e-07, + "logits/chosen": 4.0, + "logits/rejected": 3.98828125, + "logps/chosen": -822.0, + "logps/rejected": -751.0, + "loss": 0.6195, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.72119140625, + "rewards/margins": 3.939453125, + "rewards/rejected": -2.23046875, + "step": 1039 + }, + { + "epoch": 0.20635944243266036, + "grad_norm": 49.00880640908272, + "learning_rate": 9.694599958241343e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 3.88671875, + "logps/chosen": -1152.0, + "logps/rejected": -855.0, + "loss": 0.5706, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.01806640625, + "rewards/margins": 4.09375, + "rewards/rejected": -3.0703125, + "step": 1040 + }, + { + "epoch": 0.206557864973461, + "grad_norm": 29.295548055711276, + "learning_rate": 9.693470362352756e-07, + "logits/chosen": 3.734375, + "logits/rejected": 4.0703125, + "logps/chosen": -843.0, + 
"logps/rejected": -846.0, + "loss": 0.4904, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.400390625, + "rewards/margins": 13.0703125, + "rewards/rejected": -11.708984375, + "step": 1041 + }, + { + "epoch": 0.20675628751426162, + "grad_norm": 30.60993234917827, + "learning_rate": 9.692338754930925e-07, + "logits/chosen": 3.64453125, + "logits/rejected": 3.65625, + "logps/chosen": -978.0, + "logps/rejected": -1485.0, + "loss": 0.3934, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.24609375, + "rewards/margins": 7.6875, + "rewards/rejected": -5.4296875, + "step": 1042 + }, + { + "epoch": 0.20695471005506225, + "grad_norm": 33.869970992904875, + "learning_rate": 9.691205136518663e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.609375, + "logps/chosen": -1212.0, + "logps/rejected": -732.0, + "loss": 0.3789, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.74609375, + "rewards/margins": 6.7265625, + "rewards/rejected": -4.98828125, + "step": 1043 + }, + { + "epoch": 0.20715313259586288, + "grad_norm": 39.002186909712144, + "learning_rate": 9.690069507659748e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.28515625, + "logps/chosen": -1021.0, + "logps/rejected": -1164.0, + "loss": 0.445, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8251953125, + "rewards/margins": 6.765625, + "rewards/rejected": -4.9296875, + "step": 1044 + }, + { + "epoch": 0.20735155513666353, + "grad_norm": 40.871492205162916, + "learning_rate": 9.688931868898918e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 4.1171875, + "logps/chosen": -1191.5, + "logps/rejected": -1062.0, + "loss": 0.4478, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.923828125, + "rewards/margins": 6.30078125, + "rewards/rejected": -4.376953125, + "step": 1045 + }, + { + "epoch": 0.20754997767746416, + "grad_norm": 29.633630513915506, + "learning_rate": 9.68779222078188e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.40625, + "logps/chosen": -805.0, + 
"logps/rejected": -502.0, + "loss": 0.4452, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.51171875, + "rewards/margins": 6.46875, + "rewards/rejected": -4.95703125, + "step": 1046 + }, + { + "epoch": 0.2077484002182648, + "grad_norm": 45.85402625655207, + "learning_rate": 9.686650563855302e-07, + "logits/chosen": 4.078125, + "logits/rejected": 4.07421875, + "logps/chosen": -976.0, + "logps/rejected": -2147.0, + "loss": 0.5417, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.05029296875, + "rewards/margins": 6.4453125, + "rewards/rejected": -5.392578125, + "step": 1047 + }, + { + "epoch": 0.20794682275906543, + "grad_norm": 33.22303554549727, + "learning_rate": 9.68550689866682e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 4.07421875, + "logps/chosen": -1070.75, + "logps/rejected": -599.0, + "loss": 0.3856, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.87457275390625, + "rewards/margins": 5.6875, + "rewards/rejected": -3.8125, + "step": 1048 + }, + { + "epoch": 0.20814524529986606, + "grad_norm": 42.64474726591916, + "learning_rate": 9.684361225765026e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 4.09765625, + "logps/chosen": -838.0, + "logps/rejected": -630.0, + "loss": 0.5708, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.85205078125, + "rewards/margins": 3.86328125, + "rewards/rejected": -3.01953125, + "step": 1049 + }, + { + "epoch": 0.2083436678406667, + "grad_norm": 30.17027423444574, + "learning_rate": 9.68321354569948e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.1015625, + "logps/chosen": -1052.0, + "logps/rejected": -765.5, + "loss": 0.3413, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4111328125, + "rewards/margins": 6.6328125, + "rewards/rejected": -4.21875, + "step": 1050 + }, + { + "epoch": 0.20854209038146734, + "grad_norm": 39.48106953306571, + "learning_rate": 9.6820638590207e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.078125, + "logps/chosen": -1192.0, + 
"logps/rejected": -705.5, + "loss": 0.3755, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.146484375, + "rewards/margins": 5.65234375, + "rewards/rejected": -3.5, + "step": 1051 + }, + { + "epoch": 0.20874051292226797, + "grad_norm": 34.92712950367025, + "learning_rate": 9.680912166280175e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.5546875, + "logps/chosen": -959.0, + "logps/rejected": -585.0, + "loss": 0.4962, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.40234375, + "rewards/margins": 5.20703125, + "rewards/rejected": -3.80078125, + "step": 1052 + }, + { + "epoch": 0.2089389354630686, + "grad_norm": 39.97607161292241, + "learning_rate": 9.679758468030354e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.09375, + "logps/chosen": -848.0, + "logps/rejected": -1026.5, + "loss": 0.4878, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.898193359375, + "rewards/margins": 5.6015625, + "rewards/rejected": -4.6953125, + "step": 1053 + }, + { + "epoch": 0.20913735800386923, + "grad_norm": 37.62660532757083, + "learning_rate": 9.67860276482464e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.15625, + "logps/chosen": -1009.0, + "logps/rejected": -717.5, + "loss": 0.3968, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.84375, + "rewards/margins": 6.828125, + "rewards/rejected": -4.96875, + "step": 1054 + }, + { + "epoch": 0.20933578054466986, + "grad_norm": 34.86449399656753, + "learning_rate": 9.677445057217405e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.40625, + "logps/chosen": -1172.0, + "logps/rejected": -1516.0, + "loss": 0.2707, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6953125, + "rewards/margins": 9.140625, + "rewards/rejected": -6.44921875, + "step": 1055 + }, + { + "epoch": 0.20953420308547052, + "grad_norm": 32.693356043373065, + "learning_rate": 9.676285345763983e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 3.79296875, + "logps/chosen": -1048.0, + "logps/rejected": -669.5, + 
"loss": 0.4093, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.001953125, + "rewards/margins": 5.359375, + "rewards/rejected": -3.359375, + "step": 1056 + }, + { + "epoch": 0.20973262562627115, + "grad_norm": 36.21347708305699, + "learning_rate": 9.675123631020663e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 4.046875, + "logps/chosen": -1346.0, + "logps/rejected": -991.0, + "loss": 0.3336, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5, + "rewards/margins": 6.8359375, + "rewards/rejected": -4.3359375, + "step": 1057 + }, + { + "epoch": 0.20993104816707178, + "grad_norm": 43.76010379912843, + "learning_rate": 9.673959913544702e-07, + "logits/chosen": 3.31640625, + "logits/rejected": 3.40625, + "logps/chosen": -678.0, + "logps/rejected": -504.0, + "loss": 0.5698, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.9814453125, + "rewards/margins": 4.0390625, + "rewards/rejected": -3.0625, + "step": 1058 + }, + { + "epoch": 0.2101294707078724, + "grad_norm": 44.180051998276134, + "learning_rate": 9.672794193894315e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.0703125, + "logps/chosen": -916.0, + "logps/rejected": -770.0, + "loss": 0.5187, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.7255859375, + "rewards/margins": 5.5859375, + "rewards/rejected": -3.86328125, + "step": 1059 + }, + { + "epoch": 0.21032789324867304, + "grad_norm": 33.29688911906085, + "learning_rate": 9.671626472628673e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 4.265625, + "logps/chosen": -961.0, + "logps/rejected": -757.0, + "loss": 0.4417, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.9453125, + "rewards/margins": 5.8515625, + "rewards/rejected": -3.8984375, + "step": 1060 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 36.5899964976218, + "learning_rate": 9.670456750307916e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.4140625, + "logps/chosen": -975.0, + "logps/rejected": -690.5, + "loss": 0.5895, + 
"rewards/accuracies": 0.84375, + "rewards/chosen": 0.59375, + "rewards/margins": 3.62890625, + "rewards/rejected": -3.0390625, + "step": 1061 + }, + { + "epoch": 0.21072473833027433, + "grad_norm": 34.127198877069844, + "learning_rate": 9.669285027493138e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.25390625, + "logps/chosen": -787.0, + "logps/rejected": -558.5, + "loss": 0.4167, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.6796875, + "rewards/margins": 6.125, + "rewards/rejected": -4.45703125, + "step": 1062 + }, + { + "epoch": 0.21092316087107496, + "grad_norm": 36.263697777393084, + "learning_rate": 9.668111304746389e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.4140625, + "logps/chosen": -934.0, + "logps/rejected": -892.0, + "loss": 0.5542, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.8406982421875, + "rewards/margins": 4.625, + "rewards/rejected": -3.78125, + "step": 1063 + }, + { + "epoch": 0.2111215834118756, + "grad_norm": 41.968671006712334, + "learning_rate": 9.666935582630687e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.14453125, + "logps/chosen": -1444.0, + "logps/rejected": -936.0, + "loss": 0.3921, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.94921875, + "rewards/margins": 7.640625, + "rewards/rejected": -4.69140625, + "step": 1064 + }, + { + "epoch": 0.21132000595267622, + "grad_norm": 37.151370940831406, + "learning_rate": 9.665757861710007e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 3.953125, + "logps/chosen": -1096.0, + "logps/rejected": -702.0, + "loss": 0.4191, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.21484375, + "rewards/margins": 6.078125, + "rewards/rejected": -3.86328125, + "step": 1065 + }, + { + "epoch": 0.21151842849347685, + "grad_norm": 35.45757101031014, + "learning_rate": 9.664578142549275e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.53125, + "logps/chosen": -852.0, + "logps/rejected": -584.0, + "loss": 0.4031, + 
"rewards/accuracies": 0.96875, + "rewards/chosen": 2.96484375, + "rewards/margins": 6.37109375, + "rewards/rejected": -3.40625, + "step": 1066 + }, + { + "epoch": 0.21171685103427748, + "grad_norm": 49.75457779455471, + "learning_rate": 9.663396425714387e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.90234375, + "logps/chosen": -985.0, + "logps/rejected": -786.0, + "loss": 0.5012, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.421875, + "rewards/margins": 4.2734375, + "rewards/rejected": -2.84765625, + "step": 1067 + }, + { + "epoch": 0.21191527357507814, + "grad_norm": 32.940558891707475, + "learning_rate": 9.662212711772187e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 4.203125, + "logps/chosen": -602.0, + "logps/rejected": -545.0, + "loss": 0.5292, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.541015625, + "rewards/margins": 4.7265625, + "rewards/rejected": -3.19140625, + "step": 1068 + }, + { + "epoch": 0.21211369611587877, + "grad_norm": 36.17351312429782, + "learning_rate": 9.661027001290486e-07, + "logits/chosen": 4.39453125, + "logits/rejected": 4.296875, + "logps/chosen": -1383.5, + "logps/rejected": -857.0, + "loss": 0.3958, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.943359375, + "rewards/margins": 5.8984375, + "rewards/rejected": -3.96875, + "step": 1069 + }, + { + "epoch": 0.2123121186566794, + "grad_norm": 38.65829175317548, + "learning_rate": 9.659839294838047e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.87890625, + "logps/chosen": -964.0, + "logps/rejected": -1324.0, + "loss": 0.5431, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.138671875, + "rewards/margins": 5.0703125, + "rewards/rejected": -3.927734375, + "step": 1070 + }, + { + "epoch": 0.21251054119748003, + "grad_norm": 38.17704343320699, + "learning_rate": 9.658649592984586e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.28125, + "logps/chosen": -1500.0, + "logps/rejected": -844.0, + "loss": 0.3464, + 
"rewards/accuracies": 0.90625, + "rewards/chosen": 2.6083984375, + "rewards/margins": 6.76953125, + "rewards/rejected": -4.1640625, + "step": 1071 + }, + { + "epoch": 0.21270896373828066, + "grad_norm": 35.076808437555655, + "learning_rate": 9.657457896300791e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 4.06640625, + "logps/chosen": -993.0, + "logps/rejected": -733.0, + "loss": 0.4852, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.830078125, + "rewards/margins": 5.921875, + "rewards/rejected": -4.08984375, + "step": 1072 + }, + { + "epoch": 0.2129073862790813, + "grad_norm": 27.830168876566077, + "learning_rate": 9.65626420535829e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.28515625, + "logps/chosen": -892.0, + "logps/rejected": -744.5, + "loss": 0.4619, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4873046875, + "rewards/margins": 5.640625, + "rewards/rejected": -4.15625, + "step": 1073 + }, + { + "epoch": 0.21310580881988195, + "grad_norm": 33.11220210316952, + "learning_rate": 9.65506852072968e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.59375, + "logps/chosen": -1112.0, + "logps/rejected": -664.0, + "loss": 0.486, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.3681640625, + "rewards/margins": 5.46875, + "rewards/rejected": -4.09765625, + "step": 1074 + }, + { + "epoch": 0.21330423136068258, + "grad_norm": 42.523045757935094, + "learning_rate": 9.653870842988509e-07, + "logits/chosen": 3.6953125, + "logits/rejected": 3.98828125, + "logps/chosen": -676.5, + "logps/rejected": -648.5, + "loss": 0.4806, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.5732421875, + "rewards/margins": 4.98046875, + "rewards/rejected": -3.41015625, + "step": 1075 + }, + { + "epoch": 0.2135026539014832, + "grad_norm": 44.737607314488706, + "learning_rate": 9.652671172709279e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.06640625, + "logps/chosen": -785.5, + "logps/rejected": -732.0, + "loss": 0.5699, + 
"rewards/accuracies": 0.8125, + "rewards/chosen": 1.38671875, + "rewards/margins": 4.6953125, + "rewards/rejected": -3.3046875, + "step": 1076 + }, + { + "epoch": 0.21370107644228384, + "grad_norm": 36.594366831475604, + "learning_rate": 9.651469510467455e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.1953125, + "logps/chosen": -1130.0, + "logps/rejected": -841.0, + "loss": 0.3805, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.8203125, + "rewards/margins": 7.2578125, + "rewards/rejected": -5.43359375, + "step": 1077 + }, + { + "epoch": 0.21389949898308447, + "grad_norm": 37.740017357306634, + "learning_rate": 9.650265856839449e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.140625, + "logps/chosen": -1159.0, + "logps/rejected": -1452.0, + "loss": 0.4749, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0234375, + "rewards/margins": 5.982421875, + "rewards/rejected": -6.01171875, + "step": 1078 + }, + { + "epoch": 0.21409792152388513, + "grad_norm": 38.04897325315788, + "learning_rate": 9.649060212402632e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 3.86328125, + "logps/chosen": -947.0, + "logps/rejected": -597.5, + "loss": 0.3528, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.57421875, + "rewards/margins": 6.65625, + "rewards/rejected": -5.08203125, + "step": 1079 + }, + { + "epoch": 0.21429634406468576, + "grad_norm": 35.26056244662481, + "learning_rate": 9.647852577735334e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.1484375, + "logps/chosen": -1109.0, + "logps/rejected": -696.5, + "loss": 0.3919, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.703125, + "rewards/margins": 7.1875, + "rewards/rejected": -5.4765625, + "step": 1080 + }, + { + "epoch": 0.2144947666054864, + "grad_norm": 40.672460774630395, + "learning_rate": 9.646642953416834e-07, + "logits/chosen": 3.921875, + "logits/rejected": 3.81640625, + "logps/chosen": -1018.0, + "logps/rejected": -685.0, + "loss": 0.6224, + 
"rewards/accuracies": 0.8125, + "rewards/chosen": 0.85693359375, + "rewards/margins": 4.3671875, + "rewards/rejected": -3.515625, + "step": 1081 + }, + { + "epoch": 0.21469318914628702, + "grad_norm": 38.46050074831359, + "learning_rate": 9.645431340027368e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.44921875, + "logps/chosen": -983.0, + "logps/rejected": -607.5, + "loss": 0.5139, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6455078125, + "rewards/margins": 4.53125, + "rewards/rejected": -2.875, + "step": 1082 + }, + { + "epoch": 0.21489161168708765, + "grad_norm": 30.939396375991787, + "learning_rate": 9.644217738148124e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.96484375, + "logps/chosen": -964.0, + "logps/rejected": -676.0, + "loss": 0.3347, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.744140625, + "rewards/margins": 6.359375, + "rewards/rejected": -3.6171875, + "step": 1083 + }, + { + "epoch": 0.21509003422788828, + "grad_norm": 31.53322448951792, + "learning_rate": 9.643002148361244e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.08203125, + "logps/chosen": -910.0, + "logps/rejected": -613.0, + "loss": 0.4801, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8232421875, + "rewards/margins": 5.1484375, + "rewards/rejected": -3.33203125, + "step": 1084 + }, + { + "epoch": 0.21528845676868893, + "grad_norm": 34.55777483758905, + "learning_rate": 9.641784571249828e-07, + "logits/chosen": 4.8125, + "logits/rejected": 4.52734375, + "logps/chosen": -1061.0, + "logps/rejected": -688.0, + "loss": 0.4031, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.7239990234375, + "rewards/margins": 5.51171875, + "rewards/rejected": -3.796875, + "step": 1085 + }, + { + "epoch": 0.21548687930948957, + "grad_norm": 38.36910015056652, + "learning_rate": 9.640565007397924e-07, + "logits/chosen": 4.33203125, + "logits/rejected": 4.22265625, + "logps/chosen": -1202.0, + "logps/rejected": -720.0, + "loss": 0.406, + 
"rewards/accuracies": 0.9375, + "rewards/chosen": 2.314453125, + "rewards/margins": 5.7734375, + "rewards/rejected": -3.45703125, + "step": 1086 + }, + { + "epoch": 0.2156853018502902, + "grad_norm": 50.70065042983955, + "learning_rate": 9.639343457390538e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 4.16796875, + "logps/chosen": -1037.0, + "logps/rejected": -1178.0, + "loss": 0.5007, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.18115234375, + "rewards/margins": 5.09375, + "rewards/rejected": -3.9140625, + "step": 1087 + }, + { + "epoch": 0.21588372439109083, + "grad_norm": 36.22920717054896, + "learning_rate": 9.638119921813623e-07, + "logits/chosen": 3.71875, + "logits/rejected": 3.5703125, + "logps/chosen": -1065.0, + "logps/rejected": -669.0, + "loss": 0.4388, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.849609375, + "rewards/margins": 5.30078125, + "rewards/rejected": -3.458984375, + "step": 1088 + }, + { + "epoch": 0.21608214693189146, + "grad_norm": 38.07713334294843, + "learning_rate": 9.636894401254087e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.859375, + "logps/chosen": -862.0, + "logps/rejected": -584.5, + "loss": 0.4675, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.34765625, + "rewards/margins": 4.9140625, + "rewards/rejected": -2.5625, + "step": 1089 + }, + { + "epoch": 0.21628056947269209, + "grad_norm": 39.561822008164, + "learning_rate": 9.635666896299792e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 4.1171875, + "logps/chosen": -932.0, + "logps/rejected": -785.0, + "loss": 0.4063, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4716796875, + "rewards/margins": 5.8828125, + "rewards/rejected": -4.40234375, + "step": 1090 + }, + { + "epoch": 0.21647899201349274, + "grad_norm": 35.98262311105538, + "learning_rate": 9.63443740753955e-07, + "logits/chosen": 4.625, + "logits/rejected": 4.4296875, + "logps/chosen": -977.0, + "logps/rejected": -787.5, + "loss": 0.5112, + 
"rewards/accuracies": 0.90625, + "rewards/chosen": 1.86328125, + "rewards/margins": 5.23046875, + "rewards/rejected": -3.37109375, + "step": 1091 + }, + { + "epoch": 0.21667741455429337, + "grad_norm": 35.35485538857611, + "learning_rate": 9.633205935563126e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.6796875, + "logps/chosen": -1094.0, + "logps/rejected": -584.5, + "loss": 0.4182, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.8056640625, + "rewards/margins": 5.25, + "rewards/rejected": -3.45703125, + "step": 1092 + }, + { + "epoch": 0.216875837095094, + "grad_norm": 36.98777794335005, + "learning_rate": 9.631972480961233e-07, + "logits/chosen": 3.90625, + "logits/rejected": 4.03515625, + "logps/chosen": -1041.0, + "logps/rejected": -704.5, + "loss": 0.4422, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.59375, + "rewards/margins": 5.23046875, + "rewards/rejected": -3.6328125, + "step": 1093 + }, + { + "epoch": 0.21707425963589463, + "grad_norm": 32.71642931598577, + "learning_rate": 9.630737044325537e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 3.99609375, + "logps/chosen": -1040.0, + "logps/rejected": -767.0, + "loss": 0.4136, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.8916015625, + "rewards/margins": 6.4765625, + "rewards/rejected": -4.578125, + "step": 1094 + }, + { + "epoch": 0.21727268217669526, + "grad_norm": 37.01576410876059, + "learning_rate": 9.629499626248658e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.09375, + "logps/chosen": -783.0, + "logps/rejected": -489.5, + "loss": 0.6094, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.306640625, + "rewards/margins": 4.01171875, + "rewards/rejected": -2.7109375, + "step": 1095 + }, + { + "epoch": 0.2174711047174959, + "grad_norm": 32.19446984530777, + "learning_rate": 9.628260227324161e-07, + "logits/chosen": 3.890625, + "logits/rejected": 4.06640625, + "logps/chosen": -946.0, + "logps/rejected": -697.5, + "loss": 0.3427, + 
"rewards/accuracies": 0.90625, + "rewards/chosen": 2.228515625, + "rewards/margins": 7.2421875, + "rewards/rejected": -5.00390625, + "step": 1096 + }, + { + "epoch": 0.21766952725829655, + "grad_norm": 40.42103845154566, + "learning_rate": 9.627018848146563e-07, + "logits/chosen": 3.65625, + "logits/rejected": 4.0078125, + "logps/chosen": -1058.0, + "logps/rejected": -832.0, + "loss": 0.5005, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1494140625, + "rewards/margins": 5.681640625, + "rewards/rejected": -4.53125, + "step": 1097 + }, + { + "epoch": 0.21786794979909718, + "grad_norm": 32.44989980936389, + "learning_rate": 9.625775489311336e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 4.01953125, + "logps/chosen": -1324.0, + "logps/rejected": -895.0, + "loss": 0.356, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.974609375, + "rewards/margins": 8.1640625, + "rewards/rejected": -6.1796875, + "step": 1098 + }, + { + "epoch": 0.2180663723398978, + "grad_norm": 35.06293116013013, + "learning_rate": 9.624530151414893e-07, + "logits/chosen": 3.84375, + "logits/rejected": 3.921875, + "logps/chosen": -827.0, + "logps/rejected": -665.5, + "loss": 0.5048, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.80078125, + "rewards/margins": 5.4765625, + "rewards/rejected": -3.6875, + "step": 1099 + }, + { + "epoch": 0.21826479488069844, + "grad_norm": 36.471245202849616, + "learning_rate": 9.6232828350546e-07, + "logits/chosen": 3.921875, + "logits/rejected": 3.8125, + "logps/chosen": -791.0, + "logps/rejected": -473.5, + "loss": 0.4631, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.798828125, + "rewards/margins": 5.3125, + "rewards/rejected": -3.51953125, + "step": 1100 + }, + { + "epoch": 0.21846321742149907, + "grad_norm": 36.1249088290653, + "learning_rate": 9.622033540828775e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.7421875, + "logps/chosen": -749.5, + "logps/rejected": -1840.0, + "loss": 0.5049, + "rewards/accuracies": 0.84375, 
+ "rewards/chosen": 1.16796875, + "rewards/margins": 6.6015625, + "rewards/rejected": -5.4375, + "step": 1101 + }, + { + "epoch": 0.21866163996229973, + "grad_norm": 32.221357769216795, + "learning_rate": 9.620782269336682e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.3046875, + "logps/chosen": -1007.0, + "logps/rejected": -1105.0, + "loss": 0.3971, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6630859375, + "rewards/margins": 7.5859375, + "rewards/rejected": -5.9375, + "step": 1102 + }, + { + "epoch": 0.21886006250310036, + "grad_norm": 47.59498995692428, + "learning_rate": 9.61952902117853e-07, + "logits/chosen": 3.38671875, + "logits/rejected": 3.5859375, + "logps/chosen": -952.0, + "logps/rejected": -623.0, + "loss": 0.5155, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.642333984375, + "rewards/margins": 4.6484375, + "rewards/rejected": -3.01171875, + "step": 1103 + }, + { + "epoch": 0.219058485043901, + "grad_norm": 35.466584664788265, + "learning_rate": 9.618273796955487e-07, + "logits/chosen": 4.5703125, + "logits/rejected": 4.6171875, + "logps/chosen": -1051.0, + "logps/rejected": -672.0, + "loss": 0.4968, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.06640625, + "rewards/margins": 4.33203125, + "rewards/rejected": -2.26953125, + "step": 1104 + }, + { + "epoch": 0.21925690758470162, + "grad_norm": 34.32568352668577, + "learning_rate": 9.617016597269654e-07, + "logits/chosen": 3.6875, + "logits/rejected": 3.89453125, + "logps/chosen": -957.0, + "logps/rejected": -775.0, + "loss": 0.3535, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.009765625, + "rewards/margins": 5.859375, + "rewards/rejected": -3.8515625, + "step": 1105 + }, + { + "epoch": 0.21945533012550225, + "grad_norm": 40.8350053273531, + "learning_rate": 9.615757422724095e-07, + "logits/chosen": 3.7109375, + "logits/rejected": 3.6640625, + "logps/chosen": -631.0, + "logps/rejected": -499.0, + "loss": 0.4569, + "rewards/accuracies": 0.84375, + 
"rewards/chosen": 1.4208984375, + "rewards/margins": 4.90234375, + "rewards/rejected": -3.484375, + "step": 1106 + }, + { + "epoch": 0.21965375266630288, + "grad_norm": 29.43126693813735, + "learning_rate": 9.614496273922808e-07, + "logits/chosen": 4.5546875, + "logits/rejected": 4.921875, + "logps/chosen": -974.0, + "logps/rejected": -745.5, + "loss": 0.4932, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.009765625, + "rewards/margins": 5.77734375, + "rewards/rejected": -3.77734375, + "step": 1107 + }, + { + "epoch": 0.21985217520710354, + "grad_norm": 40.823914000487434, + "learning_rate": 9.613233151470743e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 3.9296875, + "logps/chosen": -935.0, + "logps/rejected": -1460.5, + "loss": 0.4659, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.07421875, + "rewards/margins": 6.765625, + "rewards/rejected": -4.693359375, + "step": 1108 + }, + { + "epoch": 0.22005059774790417, + "grad_norm": 35.08106859054364, + "learning_rate": 9.6119680559738e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.20703125, + "logps/chosen": -1110.0, + "logps/rejected": -935.0, + "loss": 0.438, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.96484375, + "rewards/margins": 5.171875, + "rewards/rejected": -3.20703125, + "step": 1109 + }, + { + "epoch": 0.2202490202887048, + "grad_norm": 35.51501117803131, + "learning_rate": 9.610700988038825e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.4453125, + "logps/chosen": -960.0, + "logps/rejected": -1009.0, + "loss": 0.4363, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4091796875, + "rewards/margins": 6.04296875, + "rewards/rejected": -3.625, + "step": 1110 + }, + { + "epoch": 0.22044744282950543, + "grad_norm": 39.724040932622884, + "learning_rate": 9.609431948273606e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 4.140625, + "logps/chosen": -1353.0, + "logps/rejected": -829.5, + "loss": 0.3646, + "rewards/accuracies": 0.9375, + 
"rewards/chosen": 2.4052734375, + "rewards/margins": 6.59375, + "rewards/rejected": -4.1875, + "step": 1111 + }, + { + "epoch": 0.22064586537030606, + "grad_norm": 34.521075873700816, + "learning_rate": 9.608160937286876e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.74609375, + "logps/chosen": -937.5, + "logps/rejected": -1936.0, + "loss": 0.5067, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.41162109375, + "rewards/margins": 7.6171875, + "rewards/rejected": -6.203125, + "step": 1112 + }, + { + "epoch": 0.2208442879111067, + "grad_norm": 32.48125509702147, + "learning_rate": 9.60688795568832e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.8984375, + "logps/chosen": -1214.0, + "logps/rejected": -879.0, + "loss": 0.3647, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.640625, + "rewards/margins": 5.88671875, + "rewards/rejected": -3.24609375, + "step": 1113 + }, + { + "epoch": 0.22104271045190735, + "grad_norm": 35.20917608226541, + "learning_rate": 9.605613004088564e-07, + "logits/chosen": 3.78515625, + "logits/rejected": 3.4375, + "logps/chosen": -798.0, + "logps/rejected": -488.5, + "loss": 0.5077, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.52001953125, + "rewards/margins": 4.13671875, + "rewards/rejected": -2.6171875, + "step": 1114 + }, + { + "epoch": 0.22124113299270798, + "grad_norm": 37.39119492849877, + "learning_rate": 9.604336083099178e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.01171875, + "logps/chosen": -1150.0, + "logps/rejected": -799.0, + "loss": 0.4389, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4560546875, + "rewards/margins": 5.66796875, + "rewards/rejected": -4.21875, + "step": 1115 + }, + { + "epoch": 0.2214395555335086, + "grad_norm": 33.805307461044855, + "learning_rate": 9.60305719333268e-07, + "logits/chosen": 3.6328125, + "logits/rejected": 3.70703125, + "logps/chosen": -590.0, + "logps/rejected": -498.5, + "loss": 0.4991, + "rewards/accuracies": 0.90625, + 
"rewards/chosen": 1.53955078125, + "rewards/margins": 4.328125, + "rewards/rejected": -2.7890625, + "step": 1116 + }, + { + "epoch": 0.22163797807430924, + "grad_norm": 29.309090009353124, + "learning_rate": 9.601776335402529e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.0859375, + "logps/chosen": -1238.0, + "logps/rejected": -887.0, + "loss": 0.375, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.79296875, + "rewards/margins": 8.15625, + "rewards/rejected": -5.375, + "step": 1117 + }, + { + "epoch": 0.22183640061510987, + "grad_norm": 38.25587105752024, + "learning_rate": 9.60049350992313e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.171875, + "logps/chosen": -1012.5, + "logps/rejected": -882.5, + "loss": 0.4791, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.90625, + "rewards/margins": 6.828125, + "rewards/rejected": -4.92578125, + "step": 1118 + }, + { + "epoch": 0.2220348231559105, + "grad_norm": 40.750462890491676, + "learning_rate": 9.599208717509836e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 4.15625, + "logps/chosen": -709.5, + "logps/rejected": -630.5, + "loss": 0.487, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.625244140625, + "rewards/margins": 5.7734375, + "rewards/rejected": -5.13671875, + "step": 1119 + }, + { + "epoch": 0.22223324569671116, + "grad_norm": 48.72432153087625, + "learning_rate": 9.597921958778934e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.140625, + "logps/chosen": -1089.0, + "logps/rejected": -1273.0, + "loss": 0.4586, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1611328125, + "rewards/margins": 7.7578125, + "rewards/rejected": -5.59765625, + "step": 1120 + }, + { + "epoch": 0.2224316682375118, + "grad_norm": 36.36715247759585, + "learning_rate": 9.59663323434766e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.765625, + "logps/chosen": -862.0, + "logps/rejected": -567.5, + "loss": 0.5607, + "rewards/accuracies": 0.84375, + "rewards/chosen": 
1.554443359375, + "rewards/margins": 4.66015625, + "rewards/rejected": -3.1015625, + "step": 1121 + }, + { + "epoch": 0.22263009077831242, + "grad_norm": 44.769683896964196, + "learning_rate": 9.595342544834193e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.2265625, + "logps/chosen": -1165.0, + "logps/rejected": -720.5, + "loss": 0.4916, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.53125, + "rewards/margins": 6.84375, + "rewards/rejected": -5.3125, + "step": 1122 + }, + { + "epoch": 0.22282851331911305, + "grad_norm": 41.00787429155131, + "learning_rate": 9.594049890857655e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 3.8203125, + "logps/chosen": -888.0, + "logps/rejected": -517.5, + "loss": 0.5509, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.01611328125, + "rewards/margins": 4.9921875, + "rewards/rejected": -3.98046875, + "step": 1123 + }, + { + "epoch": 0.22302693585991368, + "grad_norm": 43.91349856831621, + "learning_rate": 9.592755273038107e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.53125, + "logps/chosen": -1040.0, + "logps/rejected": -1222.0, + "loss": 0.4726, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.16796875, + "rewards/margins": 6.4921875, + "rewards/rejected": -5.337890625, + "step": 1124 + }, + { + "epoch": 0.2232253584007143, + "grad_norm": 33.458062578829804, + "learning_rate": 9.591458691996557e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.15625, + "logps/chosen": -809.0, + "logps/rejected": -675.0, + "loss": 0.5524, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.376953125, + "rewards/margins": 4.8125, + "rewards/rejected": -3.427734375, + "step": 1125 + }, + { + "epoch": 0.22342378094151497, + "grad_norm": 41.309385439060854, + "learning_rate": 9.590160148354948e-07, + "logits/chosen": 3.62109375, + "logits/rejected": 3.75, + "logps/chosen": -1217.0, + "logps/rejected": -863.0, + "loss": 0.3915, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.009765625, + 
"rewards/margins": 6.2421875, + "rewards/rejected": -4.23828125, + "step": 1126 + }, + { + "epoch": 0.2236222034823156, + "grad_norm": 40.84384922887815, + "learning_rate": 9.588859642736172e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.953125, + "logps/chosen": -912.0, + "logps/rejected": -626.0, + "loss": 0.3333, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.140625, + "rewards/margins": 6.1796875, + "rewards/rejected": -4.0390625, + "step": 1127 + }, + { + "epoch": 0.22382062602311623, + "grad_norm": 40.64630378032827, + "learning_rate": 9.587557175764055e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.2421875, + "logps/chosen": -1415.0, + "logps/rejected": -981.0, + "loss": 0.34, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.76171875, + "rewards/margins": 7.046875, + "rewards/rejected": -4.296875, + "step": 1128 + }, + { + "epoch": 0.22401904856391686, + "grad_norm": 37.89500330101851, + "learning_rate": 9.58625274806337e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.28125, + "logps/chosen": -975.0, + "logps/rejected": -653.0, + "loss": 0.5049, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.16015625, + "rewards/margins": 12.34375, + "rewards/rejected": -11.13671875, + "step": 1129 + }, + { + "epoch": 0.2242174711047175, + "grad_norm": 34.3100449676276, + "learning_rate": 9.58494636025983e-07, + "logits/chosen": 3.890625, + "logits/rejected": 4.15234375, + "logps/chosen": -1083.0, + "logps/rejected": -881.0, + "loss": 0.4142, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.552734375, + "rewards/margins": 7.3515625, + "rewards/rejected": -4.79296875, + "step": 1130 + }, + { + "epoch": 0.22441589364551814, + "grad_norm": 32.42641685058721, + "learning_rate": 9.583638012980079e-07, + "logits/chosen": 4.3515625, + "logits/rejected": 4.00390625, + "logps/chosen": -933.0, + "logps/rejected": -733.0, + "loss": 0.4581, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.00390625, + "rewards/margins": 
4.58203125, + "rewards/rejected": -2.58203125, + "step": 1131 + }, + { + "epoch": 0.22461431618631877, + "grad_norm": 31.6292778479836, + "learning_rate": 9.582327706851716e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.5625, + "logps/chosen": -1509.0, + "logps/rejected": -1758.5, + "loss": 0.3539, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.26953125, + "rewards/margins": 9.5390625, + "rewards/rejected": -7.2578125, + "step": 1132 + }, + { + "epoch": 0.2248127387271194, + "grad_norm": 36.33759172202822, + "learning_rate": 9.581015442503265e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.296875, + "logps/chosen": -883.0, + "logps/rejected": -562.5, + "loss": 0.3567, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.603515625, + "rewards/margins": 6.33984375, + "rewards/rejected": -3.7421875, + "step": 1133 + }, + { + "epoch": 0.22501116126792003, + "grad_norm": 49.15256084331953, + "learning_rate": 9.579701220564198e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 4.03515625, + "logps/chosen": -1033.0, + "logps/rejected": -910.0, + "loss": 0.5007, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.67333984375, + "rewards/margins": 4.4140625, + "rewards/rejected": -2.738037109375, + "step": 1134 + }, + { + "epoch": 0.22520958380872066, + "grad_norm": 36.18282466305997, + "learning_rate": 9.578385041664925e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.8203125, + "logps/chosen": -628.0, + "logps/rejected": -411.0, + "loss": 0.5453, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.255859375, + "rewards/margins": 4.3515625, + "rewards/rejected": -3.0859375, + "step": 1135 + }, + { + "epoch": 0.2254080063495213, + "grad_norm": 40.12471387421024, + "learning_rate": 9.577066906436793e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 4.08984375, + "logps/chosen": -798.0, + "logps/rejected": -784.0, + "loss": 0.5701, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.318359375, + "rewards/margins": 
6.515625, + "rewards/rejected": -5.19140625, + "step": 1136 + }, + { + "epoch": 0.22560642889032195, + "grad_norm": 32.665921456207, + "learning_rate": 9.575746815512087e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.21484375, + "logps/chosen": -1049.5, + "logps/rejected": -606.5, + "loss": 0.5224, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.10546875, + "rewards/margins": 4.6953125, + "rewards/rejected": -2.5859375, + "step": 1137 + }, + { + "epoch": 0.22580485143112258, + "grad_norm": 46.05722538732213, + "learning_rate": 9.574424769524033e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 3.91015625, + "logps/chosen": -1123.0, + "logps/rejected": -1006.0, + "loss": 0.3967, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.9541015625, + "rewards/margins": 7.15625, + "rewards/rejected": -5.19140625, + "step": 1138 + }, + { + "epoch": 0.2260032739719232, + "grad_norm": 38.649636207503065, + "learning_rate": 9.573100769106793e-07, + "logits/chosen": 3.55078125, + "logits/rejected": 3.3984375, + "logps/chosen": -1249.0, + "logps/rejected": -683.0, + "loss": 0.4819, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6640625, + "rewards/margins": 5.78125, + "rewards/rejected": -5.11328125, + "step": 1139 + }, + { + "epoch": 0.22620169651272384, + "grad_norm": 36.341467545160874, + "learning_rate": 9.571774814895464e-07, + "logits/chosen": 3.7734375, + "logits/rejected": 3.578125, + "logps/chosen": -807.0, + "logps/rejected": -773.0, + "loss": 0.4277, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.80078125, + "rewards/margins": 15.5234375, + "rewards/rejected": -13.73046875, + "step": 1140 + }, + { + "epoch": 0.22640011905352447, + "grad_norm": 35.24707262967627, + "learning_rate": 9.570446907526087e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.69140625, + "logps/chosen": -1264.0, + "logps/rejected": -963.0, + "loss": 0.4084, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.791015625, + "rewards/margins": 
7.703125, + "rewards/rejected": -5.90625, + "step": 1141 + }, + { + "epoch": 0.2265985415943251, + "grad_norm": 33.858134005074845, + "learning_rate": 9.56911704763563e-07, + "logits/chosen": 3.75390625, + "logits/rejected": 3.50390625, + "logps/chosen": -831.0, + "logps/rejected": -653.5, + "loss": 0.4387, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.5712890625, + "rewards/margins": 5.2265625, + "rewards/rejected": -3.6484375, + "step": 1142 + }, + { + "epoch": 0.22679696413512576, + "grad_norm": 39.27268160008222, + "learning_rate": 9.567785235862008e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 3.8671875, + "logps/chosen": -859.0, + "logps/rejected": -655.0, + "loss": 0.4773, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.66015625, + "rewards/margins": 5.3203125, + "rewards/rejected": -3.66015625, + "step": 1143 + }, + { + "epoch": 0.2269953866759264, + "grad_norm": 33.98872263701253, + "learning_rate": 9.566451472844064e-07, + "logits/chosen": 3.859375, + "logits/rejected": 3.91015625, + "logps/chosen": -1407.0, + "logps/rejected": -1058.0, + "loss": 0.5402, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.94140625, + "rewards/margins": 5.76171875, + "rewards/rejected": -4.82421875, + "step": 1144 + }, + { + "epoch": 0.22719380921672702, + "grad_norm": 34.89561892739984, + "learning_rate": 9.565115759221584e-07, + "logits/chosen": 3.76953125, + "logits/rejected": 3.82421875, + "logps/chosen": -1044.0, + "logps/rejected": -653.5, + "loss": 0.402, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.125, + "rewards/margins": 6.2421875, + "rewards/rejected": -4.1171875, + "step": 1145 + }, + { + "epoch": 0.22739223175752765, + "grad_norm": 38.336581010038145, + "learning_rate": 9.563778095635282e-07, + "logits/chosen": 3.734375, + "logits/rejected": 3.96875, + "logps/chosen": -1230.0, + "logps/rejected": -917.0, + "loss": 0.5308, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.0439453125, + "rewards/margins": 6.29296875, + 
"rewards/rejected": -4.2578125, + "step": 1146 + }, + { + "epoch": 0.22759065429832828, + "grad_norm": 37.58498987423858, + "learning_rate": 9.562438482726814e-07, + "logits/chosen": 3.9375, + "logits/rejected": 4.3046875, + "logps/chosen": -996.0, + "logps/rejected": -1092.0, + "loss": 0.4475, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6748046875, + "rewards/margins": 6.271484375, + "rewards/rejected": -4.609375, + "step": 1147 + }, + { + "epoch": 0.2277890768391289, + "grad_norm": 42.022441070238735, + "learning_rate": 9.56109692113877e-07, + "logits/chosen": 3.73046875, + "logits/rejected": 3.8828125, + "logps/chosen": -937.0, + "logps/rejected": -658.0, + "loss": 0.5269, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.011962890625, + "rewards/margins": 4.3828125, + "rewards/rejected": -3.36328125, + "step": 1148 + }, + { + "epoch": 0.22798749937992957, + "grad_norm": 26.15994980537805, + "learning_rate": 9.55975341151467e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.05859375, + "logps/chosen": -971.0, + "logps/rejected": -591.0, + "loss": 0.3078, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.56640625, + "rewards/margins": 6.3828125, + "rewards/rejected": -3.828125, + "step": 1149 + }, + { + "epoch": 0.2281859219207302, + "grad_norm": 34.69186849392785, + "learning_rate": 9.558407954498972e-07, + "logits/chosen": 3.84375, + "logits/rejected": 4.05078125, + "logps/chosen": -971.0, + "logps/rejected": -867.5, + "loss": 0.4817, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.994140625, + "rewards/margins": 5.5703125, + "rewards/rejected": -3.568359375, + "step": 1150 + }, + { + "epoch": 0.22838434446153083, + "grad_norm": 38.1018259906187, + "learning_rate": 9.55706055073707e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.15625, + "logps/chosen": -867.0, + "logps/rejected": -495.5, + "loss": 0.3816, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.73828125, + "rewards/margins": 5.421875, + "rewards/rejected": 
-3.68359375, + "step": 1151 + }, + { + "epoch": 0.22858276700233146, + "grad_norm": 39.40756367857515, + "learning_rate": 9.555711200875289e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 3.96484375, + "logps/chosen": -1009.0, + "logps/rejected": -767.0, + "loss": 0.5089, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.67578125, + "rewards/margins": 5.763671875, + "rewards/rejected": -3.0869140625, + "step": 1152 + }, + { + "epoch": 0.2287811895431321, + "grad_norm": 43.85564223720344, + "learning_rate": 9.554359905560885e-07, + "logits/chosen": 4.23046875, + "logits/rejected": 4.140625, + "logps/chosen": -905.5, + "logps/rejected": -707.0, + "loss": 0.4844, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1650390625, + "rewards/margins": 6.41015625, + "rewards/rejected": -5.24609375, + "step": 1153 + }, + { + "epoch": 0.22897961208393272, + "grad_norm": 36.76778240080286, + "learning_rate": 9.553006665442054e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.25390625, + "logps/chosen": -784.0, + "logps/rejected": -670.5, + "loss": 0.4819, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2138671875, + "rewards/margins": 5.5078125, + "rewards/rejected": -4.2890625, + "step": 1154 + }, + { + "epoch": 0.22917803462473338, + "grad_norm": 34.23370217512907, + "learning_rate": 9.551651481167923e-07, + "logits/chosen": 3.80859375, + "logits/rejected": 3.87890625, + "logps/chosen": -998.0, + "logps/rejected": -676.0, + "loss": 0.3763, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.080078125, + "rewards/margins": 5.890625, + "rewards/rejected": -3.80859375, + "step": 1155 + }, + { + "epoch": 0.229376457165534, + "grad_norm": 39.1936394031814, + "learning_rate": 9.550294353388546e-07, + "logits/chosen": 4.7578125, + "logits/rejected": 4.421875, + "logps/chosen": -837.0, + "logps/rejected": -576.5, + "loss": 0.4995, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.3486328125, + "rewards/margins": 4.294921875, + "rewards/rejected": 
-2.9375, + "step": 1156 + }, + { + "epoch": 0.22957487970633464, + "grad_norm": 41.573537259889676, + "learning_rate": 9.54893528275491e-07, + "logits/chosen": 3.5859375, + "logits/rejected": 3.8515625, + "logps/chosen": -956.0, + "logps/rejected": -692.5, + "loss": 0.6654, + "rewards/accuracies": 0.59375, + "rewards/chosen": 1.3427734375, + "rewards/margins": 3.1318359375, + "rewards/rejected": -1.7880859375, + "step": 1157 + }, + { + "epoch": 0.22977330224713527, + "grad_norm": 36.04164735187658, + "learning_rate": 9.547574269918947e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.3671875, + "logps/chosen": -698.0, + "logps/rejected": -627.0, + "loss": 0.6429, + "rewards/accuracies": 0.65625, + "rewards/chosen": 1.33203125, + "rewards/margins": 3.5234375, + "rewards/rejected": -2.19140625, + "step": 1158 + }, + { + "epoch": 0.2299717247879359, + "grad_norm": 36.305685706921494, + "learning_rate": 9.546211315533501e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 4.1171875, + "logps/chosen": -937.0, + "logps/rejected": -558.0, + "loss": 0.496, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.794921875, + "rewards/margins": 5.0625, + "rewards/rejected": -3.2734375, + "step": 1159 + }, + { + "epoch": 0.23017014732873656, + "grad_norm": 35.820081566565136, + "learning_rate": 9.544846420252362e-07, + "logits/chosen": 3.7734375, + "logits/rejected": 4.1015625, + "logps/chosen": -685.0, + "logps/rejected": -556.5, + "loss": 0.6701, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.895751953125, + "rewards/margins": 3.474609375, + "rewards/rejected": -2.580078125, + "step": 1160 + }, + { + "epoch": 0.2303685698695372, + "grad_norm": 33.27334082696235, + "learning_rate": 9.543479584730247e-07, + "logits/chosen": 3.9375, + "logits/rejected": 3.921875, + "logps/chosen": -1040.0, + "logps/rejected": -797.0, + "loss": 0.4011, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.947265625, + "rewards/margins": 5.55859375, + "rewards/rejected": 
-3.609375, + "step": 1161 + }, + { + "epoch": 0.23056699241033782, + "grad_norm": 42.481459023039584, + "learning_rate": 9.5421108096228e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.1796875, + "logps/chosen": -1044.0, + "logps/rejected": -891.5, + "loss": 0.5401, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.77734375, + "rewards/margins": 5.55078125, + "rewards/rejected": -3.76953125, + "step": 1162 + }, + { + "epoch": 0.23076541495113845, + "grad_norm": 37.91622782609033, + "learning_rate": 9.540740095586595e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.1484375, + "logps/chosen": -886.0, + "logps/rejected": -613.0, + "loss": 0.5023, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.912109375, + "rewards/margins": 5.3671875, + "rewards/rejected": -3.4453125, + "step": 1163 + }, + { + "epoch": 0.23096383749193908, + "grad_norm": 31.281969645048925, + "learning_rate": 9.539367443279146e-07, + "logits/chosen": 4.5625, + "logits/rejected": 4.59375, + "logps/chosen": -870.5, + "logps/rejected": -1304.25, + "loss": 0.4344, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.958984375, + "rewards/margins": 7.1328125, + "rewards/rejected": -5.18359375, + "step": 1164 + }, + { + "epoch": 0.2311622600327397, + "grad_norm": 33.38950540311986, + "learning_rate": 9.537992853358887e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.46875, + "logps/chosen": -938.5, + "logps/rejected": -611.0, + "loss": 0.4739, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.1875, + "rewards/margins": 5.37109375, + "rewards/rejected": -4.18359375, + "step": 1165 + }, + { + "epoch": 0.23136068257354037, + "grad_norm": 30.72062644900587, + "learning_rate": 9.536616326485184e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 4.0390625, + "logps/chosen": -719.5, + "logps/rejected": -699.5, + "loss": 0.5125, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.0458984375, + "rewards/margins": 5.28515625, + "rewards/rejected": -4.23828125, + "step": 
1166 + }, + { + "epoch": 0.231559105114341, + "grad_norm": 41.6929051100114, + "learning_rate": 9.535237863318333e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.26953125, + "logps/chosen": -794.0, + "logps/rejected": -695.5, + "loss": 0.6007, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4444580078125, + "rewards/margins": 4.578125, + "rewards/rejected": -4.125, + "step": 1167 + }, + { + "epoch": 0.23175752765514163, + "grad_norm": 35.02865585289289, + "learning_rate": 9.533857464519557e-07, + "logits/chosen": 4.36328125, + "logits/rejected": 4.28515625, + "logps/chosen": -733.5, + "logps/rejected": -661.5, + "loss": 0.487, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.3466796875, + "rewards/margins": 5.6875, + "rewards/rejected": -4.34375, + "step": 1168 + }, + { + "epoch": 0.23195595019594226, + "grad_norm": 32.53177113102101, + "learning_rate": 9.53247513075101e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.0546875, + "logps/chosen": -1184.0, + "logps/rejected": -1557.0, + "loss": 0.3562, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.416015625, + "rewards/margins": 10.2421875, + "rewards/rejected": -7.83203125, + "step": 1169 + }, + { + "epoch": 0.2321543727367429, + "grad_norm": 28.728342630415238, + "learning_rate": 9.531090862675772e-07, + "logits/chosen": 3.8515625, + "logits/rejected": 3.81640625, + "logps/chosen": -1248.0, + "logps/rejected": -763.5, + "loss": 0.5028, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.193359375, + "rewards/margins": 5.4453125, + "rewards/rejected": -3.244140625, + "step": 1170 + }, + { + "epoch": 0.23235279527754352, + "grad_norm": 35.440095318173526, + "learning_rate": 9.529704660957854e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 4.08984375, + "logps/chosen": -719.5, + "logps/rejected": -503.5, + "loss": 0.5039, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.39453125, + "rewards/margins": 4.1796875, + "rewards/rejected": -2.78515625, + "step": 1171 + }, 
+ { + "epoch": 0.23255121781834417, + "grad_norm": 39.430899521746696, + "learning_rate": 9.52831652626219e-07, + "logits/chosen": 3.828125, + "logits/rejected": 3.9921875, + "logps/chosen": -842.5, + "logps/rejected": -822.0, + "loss": 0.4899, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.068359375, + "rewards/margins": 4.9921875, + "rewards/rejected": -2.91796875, + "step": 1172 + }, + { + "epoch": 0.2327496403591448, + "grad_norm": 38.48623664816168, + "learning_rate": 9.526926459254645e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.46484375, + "logps/chosen": -869.0, + "logps/rejected": -616.5, + "loss": 0.5451, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.0, + "rewards/margins": 3.80859375, + "rewards/rejected": -1.80859375, + "step": 1173 + }, + { + "epoch": 0.23294806289994544, + "grad_norm": 32.29207352932414, + "learning_rate": 9.525534460602009e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.453125, + "logps/chosen": -1215.0, + "logps/rejected": -1035.0, + "loss": 0.4318, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.607421875, + "rewards/margins": 7.25, + "rewards/rejected": -4.63671875, + "step": 1174 + }, + { + "epoch": 0.23314648544074607, + "grad_norm": 33.30320706543514, + "learning_rate": 9.524140530971998e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.27734375, + "logps/chosen": -1082.0, + "logps/rejected": -2740.5, + "loss": 0.3453, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.18359375, + "rewards/margins": 9.9765625, + "rewards/rejected": -7.80859375, + "step": 1175 + }, + { + "epoch": 0.2333449079815467, + "grad_norm": 36.612322044503784, + "learning_rate": 9.522744671033257e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 4.3515625, + "logps/chosen": -1241.5, + "logps/rejected": -743.0, + "loss": 0.4089, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.107421875, + "rewards/margins": 6.01953125, + "rewards/rejected": -3.90234375, + "step": 1176 + }, + { + "epoch": 
0.23354333052234733, + "grad_norm": 26.41789679085262, + "learning_rate": 9.521346881455354e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.359375, + "logps/chosen": -849.0, + "logps/rejected": -1243.0, + "loss": 0.3919, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.052734375, + "rewards/margins": 7.27734375, + "rewards/rejected": -5.2421875, + "step": 1177 + }, + { + "epoch": 0.23374175306314798, + "grad_norm": 35.9428969385294, + "learning_rate": 9.519947162908785e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.3046875, + "logps/chosen": -998.0, + "logps/rejected": -653.0, + "loss": 0.3449, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.810546875, + "rewards/margins": 6.078125, + "rewards/rejected": -4.2734375, + "step": 1178 + }, + { + "epoch": 0.2339401756039486, + "grad_norm": 37.85091199708223, + "learning_rate": 9.51854551606497e-07, + "logits/chosen": 3.8515625, + "logits/rejected": 3.9453125, + "logps/chosen": -889.0, + "logps/rejected": -752.5, + "loss": 0.478, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4873046875, + "rewards/margins": 5.578125, + "rewards/rejected": -4.087890625, + "step": 1179 + }, + { + "epoch": 0.23413859814474924, + "grad_norm": 39.43096775086384, + "learning_rate": 9.517141941596252e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.078125, + "logps/chosen": -1153.0, + "logps/rejected": -679.0, + "loss": 0.436, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.837890625, + "rewards/margins": 5.24609375, + "rewards/rejected": -3.40625, + "step": 1180 + }, + { + "epoch": 0.23433702068554987, + "grad_norm": 35.31751076256811, + "learning_rate": 9.515736440175903e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 3.93359375, + "logps/chosen": -1127.0, + "logps/rejected": -678.0, + "loss": 0.4391, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.478515625, + "rewards/margins": 6.4609375, + "rewards/rejected": -3.98828125, + "step": 1181 + }, + { + "epoch": 
0.2345354432263505, + "grad_norm": 36.42945083771838, + "learning_rate": 9.514329012478115e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 3.94921875, + "logps/chosen": -750.0, + "logps/rejected": -470.0, + "loss": 0.4994, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.5693359375, + "rewards/margins": 4.51953125, + "rewards/rejected": -2.95703125, + "step": 1182 + }, + { + "epoch": 0.23473386576715116, + "grad_norm": 42.56444729484485, + "learning_rate": 9.512919659178007e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 3.9609375, + "logps/chosen": -911.0, + "logps/rejected": -663.0, + "loss": 0.4108, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9248046875, + "rewards/margins": 6.0703125, + "rewards/rejected": -4.140625, + "step": 1183 + }, + { + "epoch": 0.2349322883079518, + "grad_norm": 35.57900079973549, + "learning_rate": 9.511508380951624e-07, + "logits/chosen": 4.53125, + "logits/rejected": 4.53125, + "logps/chosen": -1513.0, + "logps/rejected": -1189.0, + "loss": 0.4673, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.720703125, + "rewards/margins": 6.69921875, + "rewards/rejected": -5.984375, + "step": 1184 + }, + { + "epoch": 0.23513071084875242, + "grad_norm": 41.985515258819234, + "learning_rate": 9.510095178475925e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 3.91796875, + "logps/chosen": -1085.0, + "logps/rejected": -739.5, + "loss": 0.4739, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.826171875, + "rewards/margins": 6.75, + "rewards/rejected": -4.94140625, + "step": 1185 + }, + { + "epoch": 0.23532913338955305, + "grad_norm": 31.697005196084817, + "learning_rate": 9.508680052428804e-07, + "logits/chosen": 3.66796875, + "logits/rejected": 4.1953125, + "logps/chosen": -875.0, + "logps/rejected": -1920.0, + "loss": 0.6056, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.92236328125, + "rewards/margins": 7.091796875, + "rewards/rejected": -6.1796875, + "step": 1186 + }, + { + "epoch": 
0.23552755593035368, + "grad_norm": 52.00377468289682, + "learning_rate": 9.507263003489068e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.30859375, + "logps/chosen": -1125.0, + "logps/rejected": -984.5, + "loss": 0.3309, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.484375, + "rewards/margins": 7.328125, + "rewards/rejected": -4.83984375, + "step": 1187 + }, + { + "epoch": 0.2357259784711543, + "grad_norm": 42.87140624726413, + "learning_rate": 9.505844032336451e-07, + "logits/chosen": 4.05078125, + "logits/rejected": 4.23828125, + "logps/chosen": -1058.0, + "logps/rejected": -746.0, + "loss": 0.451, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.046875, + "rewards/margins": 6.84375, + "rewards/rejected": -4.8046875, + "step": 1188 + }, + { + "epoch": 0.23592440101195497, + "grad_norm": 34.66404093486851, + "learning_rate": 9.504423139651609e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.22265625, + "logps/chosen": -830.0, + "logps/rejected": -539.0, + "loss": 0.4466, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.943359375, + "rewards/margins": 5.5703125, + "rewards/rejected": -3.62109375, + "step": 1189 + }, + { + "epoch": 0.2361228235527556, + "grad_norm": 37.5297960455304, + "learning_rate": 9.503000326116117e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 3.90234375, + "logps/chosen": -1127.0, + "logps/rejected": -1187.5, + "loss": 0.4825, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.807373046875, + "rewards/margins": 6.87109375, + "rewards/rejected": -5.08203125, + "step": 1190 + }, + { + "epoch": 0.23632124609355623, + "grad_norm": 62.926082296393446, + "learning_rate": 9.501575592412476e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.14453125, + "logps/chosen": -1352.0, + "logps/rejected": -1199.0, + "loss": 0.4633, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5927734375, + "rewards/margins": 7.7734375, + "rewards/rejected": -5.18359375, + "step": 1191 + }, + { + "epoch": 
0.23651966863435686, + "grad_norm": 39.7277702506824, + "learning_rate": 9.500148939224106e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 4.4921875, + "logps/chosen": -904.0, + "logps/rejected": -1379.5, + "loss": 0.6375, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.06787109375, + "rewards/margins": 5.20703125, + "rewards/rejected": -4.146484375, + "step": 1192 + }, + { + "epoch": 0.2367180911751575, + "grad_norm": 41.361384120234575, + "learning_rate": 9.498720367235345e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.2734375, + "logps/chosen": -938.0, + "logps/rejected": -1074.0, + "loss": 0.4079, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4814453125, + "rewards/margins": 6.89453125, + "rewards/rejected": -5.41796875, + "step": 1193 + }, + { + "epoch": 0.23691651371595812, + "grad_norm": 37.51219951400606, + "learning_rate": 9.497289877131452e-07, + "logits/chosen": 3.73046875, + "logits/rejected": 3.69921875, + "logps/chosen": -816.0, + "logps/rejected": -611.0, + "loss": 0.508, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1669921875, + "rewards/margins": 4.52734375, + "rewards/rejected": -3.36328125, + "step": 1194 + }, + { + "epoch": 0.23711493625675878, + "grad_norm": 38.579737355233526, + "learning_rate": 9.495857469598612e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.1328125, + "logps/chosen": -957.0, + "logps/rejected": -656.0, + "loss": 0.4312, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.02734375, + "rewards/margins": 6.12109375, + "rewards/rejected": -4.0927734375, + "step": 1195 + }, + { + "epoch": 0.2373133587975594, + "grad_norm": 36.98162615711095, + "learning_rate": 9.494423145323923e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 4.23828125, + "logps/chosen": -664.5, + "logps/rejected": -731.0, + "loss": 0.5529, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.60986328125, + "rewards/margins": 4.34375, + "rewards/rejected": -2.71875, + "step": 1196 + }, + { + 
"epoch": 0.23751178133836004, + "grad_norm": 37.21595002186119, + "learning_rate": 9.492986904995405e-07, + "logits/chosen": 3.59375, + "logits/rejected": 3.7109375, + "logps/chosen": -1100.0, + "logps/rejected": -755.0, + "loss": 0.4123, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.294921875, + "rewards/margins": 6.0625, + "rewards/rejected": -3.78125, + "step": 1197 + }, + { + "epoch": 0.23771020387916067, + "grad_norm": 31.986962171709045, + "learning_rate": 9.491548749301997e-07, + "logits/chosen": 4.44140625, + "logits/rejected": 4.55078125, + "logps/chosen": -997.0, + "logps/rejected": -690.0, + "loss": 0.3319, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3740234375, + "rewards/margins": 6.4140625, + "rewards/rejected": -4.04296875, + "step": 1198 + }, + { + "epoch": 0.2379086264199613, + "grad_norm": 31.411774871568024, + "learning_rate": 9.490108678933557e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 4.01171875, + "logps/chosen": -1095.0, + "logps/rejected": -667.0, + "loss": 0.4092, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.859375, + "rewards/margins": 6.57421875, + "rewards/rejected": -3.720703125, + "step": 1199 + }, + { + "epoch": 0.23810704896076193, + "grad_norm": 33.35141335276766, + "learning_rate": 9.488666694580862e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 4.2421875, + "logps/chosen": -964.0, + "logps/rejected": -655.5, + "loss": 0.3639, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6875, + "rewards/margins": 6.2421875, + "rewards/rejected": -4.5546875, + "step": 1200 + }, + { + "epoch": 0.2383054715015626, + "grad_norm": 38.171974362535494, + "learning_rate": 9.487222796935605e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.44921875, + "logps/chosen": -1434.0, + "logps/rejected": -808.0, + "loss": 0.3067, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.53125, + "rewards/margins": 7.4921875, + "rewards/rejected": -4.95703125, + "step": 1201 + }, + { + "epoch": 
0.23850389404236322, + "grad_norm": 36.282933349916355, + "learning_rate": 9.485776986690397e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.87890625, + "logps/chosen": -1174.0, + "logps/rejected": -666.0, + "loss": 0.3628, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.9296875, + "rewards/margins": 6.328125, + "rewards/rejected": -4.39453125, + "step": 1202 + }, + { + "epoch": 0.23870231658316385, + "grad_norm": 32.22750515775737, + "learning_rate": 9.484329264538771e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.46875, + "logps/chosen": -960.0, + "logps/rejected": -737.0, + "loss": 0.4524, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.96533203125, + "rewards/margins": 6.6015625, + "rewards/rejected": -5.625, + "step": 1203 + }, + { + "epoch": 0.23890073912396448, + "grad_norm": 35.30848831822494, + "learning_rate": 9.48287963117517e-07, + "logits/chosen": 4.390625, + "logits/rejected": 4.2421875, + "logps/chosen": -1230.0, + "logps/rejected": -699.0, + "loss": 0.4164, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.07421875, + "rewards/margins": 5.19921875, + "rewards/rejected": -3.11328125, + "step": 1204 + }, + { + "epoch": 0.2390991616647651, + "grad_norm": 34.38991327254115, + "learning_rate": 9.481428087294959e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.3203125, + "logps/chosen": -859.0, + "logps/rejected": -711.5, + "loss": 0.465, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.45703125, + "rewards/margins": 6.0703125, + "rewards/rejected": -4.609375, + "step": 1205 + }, + { + "epoch": 0.23929758420556574, + "grad_norm": 35.12438173102964, + "learning_rate": 9.479974633594419e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.15625, + "logps/chosen": -937.0, + "logps/rejected": -749.0, + "loss": 0.3987, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7734375, + "rewards/margins": 7.6953125, + "rewards/rejected": -4.921875, + "step": 1206 + }, + { + "epoch": 0.2394960067463664, + 
"grad_norm": 35.23674627222789, + "learning_rate": 9.478519270770744e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.26171875, + "logps/chosen": -929.0, + "logps/rejected": -672.0, + "loss": 0.374, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.64453125, + "rewards/margins": 5.6171875, + "rewards/rejected": -3.97265625, + "step": 1207 + }, + { + "epoch": 0.23969442928716703, + "grad_norm": 31.860486798708116, + "learning_rate": 9.477061999522046e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.16015625, + "logps/chosen": -1247.0, + "logps/rejected": -857.5, + "loss": 0.2895, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.279296875, + "rewards/margins": 8.0234375, + "rewards/rejected": -5.7421875, + "step": 1208 + }, + { + "epoch": 0.23989285182796766, + "grad_norm": 38.109364456049974, + "learning_rate": 9.475602820547352e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 4.08984375, + "logps/chosen": -972.0, + "logps/rejected": -749.0, + "loss": 0.541, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.744140625, + "rewards/margins": 9.34765625, + "rewards/rejected": -7.58984375, + "step": 1209 + }, + { + "epoch": 0.2400912743687683, + "grad_norm": 35.54419179413371, + "learning_rate": 9.474141734546606e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.4296875, + "logps/chosen": -642.5, + "logps/rejected": -1295.5, + "loss": 0.5203, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.4794921875, + "rewards/margins": 7.2734375, + "rewards/rejected": -6.796875, + "step": 1210 + }, + { + "epoch": 0.24028969690956892, + "grad_norm": 31.61363375363232, + "learning_rate": 9.472678742220664e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 3.94140625, + "logps/chosen": -1135.0, + "logps/rejected": -757.5, + "loss": 0.5208, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.330078125, + "rewards/margins": 5.16796875, + "rewards/rejected": -3.83203125, + "step": 1211 + }, + { + "epoch": 0.24048811945036958, + 
"grad_norm": 35.540344033878114, + "learning_rate": 9.471213844271297e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.2890625, + "logps/chosen": -976.0, + "logps/rejected": -611.5, + "loss": 0.4298, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.361328125, + "rewards/margins": 4.640625, + "rewards/rejected": -3.28125, + "step": 1212 + }, + { + "epoch": 0.2406865419911702, + "grad_norm": 34.61688112497375, + "learning_rate": 9.469747041401189e-07, + "logits/chosen": 3.52734375, + "logits/rejected": 3.734375, + "logps/chosen": -880.0, + "logps/rejected": -688.0, + "loss": 0.5241, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.212890625, + "rewards/margins": 4.46484375, + "rewards/rejected": -3.255859375, + "step": 1213 + }, + { + "epoch": 0.24088496453197084, + "grad_norm": 31.577761430612256, + "learning_rate": 9.46827833431394e-07, + "logits/chosen": 4.5625, + "logits/rejected": 4.4296875, + "logps/chosen": -1326.0, + "logps/rejected": -761.5, + "loss": 0.3874, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.50390625, + "rewards/margins": 6.3828125, + "rewards/rejected": -3.8828125, + "step": 1214 + }, + { + "epoch": 0.24108338707277147, + "grad_norm": 33.739359740823744, + "learning_rate": 9.466807723714067e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 3.8828125, + "logps/chosen": -1040.0, + "logps/rejected": -856.0, + "loss": 0.4692, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.841796875, + "rewards/margins": 4.91796875, + "rewards/rejected": -3.08203125, + "step": 1215 + }, + { + "epoch": 0.2412818096135721, + "grad_norm": 34.058536584298345, + "learning_rate": 9.465335210306989e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.984375, + "logps/chosen": -1206.0, + "logps/rejected": -891.0, + "loss": 0.4619, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.307373046875, + "rewards/margins": 4.64453125, + "rewards/rejected": -3.328125, + "step": 1216 + }, + { + "epoch": 0.24148023215437273, + 
"grad_norm": 34.43527860310711, + "learning_rate": 9.463860794799051e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 4.1015625, + "logps/chosen": -855.0, + "logps/rejected": -529.5, + "loss": 0.4586, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.759765625, + "rewards/margins": 4.51171875, + "rewards/rejected": -2.75, + "step": 1217 + }, + { + "epoch": 0.24167865469517338, + "grad_norm": 34.204191193880504, + "learning_rate": 9.4623844778975e-07, + "logits/chosen": 3.51171875, + "logits/rejected": 3.84375, + "logps/chosen": -1148.0, + "logps/rejected": -749.0, + "loss": 0.4442, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.927734375, + "rewards/margins": 5.21484375, + "rewards/rejected": -3.28515625, + "step": 1218 + }, + { + "epoch": 0.24187707723597401, + "grad_norm": 32.20186002399724, + "learning_rate": 9.460906260310499e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.12109375, + "logps/chosen": -894.0, + "logps/rejected": -547.5, + "loss": 0.4092, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.388671875, + "rewards/margins": 5.421875, + "rewards/rejected": -3.02734375, + "step": 1219 + }, + { + "epoch": 0.24207549977677464, + "grad_norm": 39.78533336614523, + "learning_rate": 9.459426142747125e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.98828125, + "logps/chosen": -733.0, + "logps/rejected": -725.5, + "loss": 0.5381, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.07080078125, + "rewards/margins": 14.8125, + "rewards/rejected": -13.7421875, + "step": 1220 + }, + { + "epoch": 0.24227392231757527, + "grad_norm": 34.54880525117105, + "learning_rate": 9.457944125917361e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.30859375, + "logps/chosen": -966.0, + "logps/rejected": -645.5, + "loss": 0.3575, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.208984375, + "rewards/margins": 6.2265625, + "rewards/rejected": -4.02734375, + "step": 1221 + }, + { + "epoch": 0.2424723448583759, + "grad_norm": 
31.125575653791135, + "learning_rate": 9.456460210532108e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 4.02734375, + "logps/chosen": -988.0, + "logps/rejected": -918.0, + "loss": 0.5342, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.572265625, + "rewards/margins": 12.9453125, + "rewards/rejected": -11.38671875, + "step": 1222 + }, + { + "epoch": 0.24267076739917653, + "grad_norm": 44.04067699123653, + "learning_rate": 9.454974397303169e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 4.2265625, + "logps/chosen": -799.0, + "logps/rejected": -1246.0, + "loss": 0.51, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1328125, + "rewards/margins": 6.9453125, + "rewards/rejected": -5.8125, + "step": 1223 + }, + { + "epoch": 0.2428691899399772, + "grad_norm": 38.36782876926571, + "learning_rate": 9.453486686943267e-07, + "logits/chosen": 3.68359375, + "logits/rejected": 3.5078125, + "logps/chosen": -769.5, + "logps/rejected": -624.5, + "loss": 0.4439, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.41015625, + "rewards/margins": 5.0859375, + "rewards/rejected": -3.67578125, + "step": 1224 + }, + { + "epoch": 0.24306761248077782, + "grad_norm": 40.06044984247322, + "learning_rate": 9.451997080166028e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.12109375, + "logps/chosen": -1051.0, + "logps/rejected": -665.0, + "loss": 0.4929, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8992919921875, + "rewards/margins": 4.677734375, + "rewards/rejected": -3.78125, + "step": 1225 + }, + { + "epoch": 0.24326603502157845, + "grad_norm": 37.97076378194251, + "learning_rate": 9.45050557768599e-07, + "logits/chosen": 4.31640625, + "logits/rejected": 4.29296875, + "logps/chosen": -1160.0, + "logps/rejected": -776.0, + "loss": 0.5367, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.1611328125, + "rewards/margins": 5.4609375, + "rewards/rejected": -3.3125, + "step": 1226 + }, + { + "epoch": 0.24346445756237908, + "grad_norm": 
42.80318668555057, + "learning_rate": 9.449012180218598e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 3.87890625, + "logps/chosen": -998.0, + "logps/rejected": -698.0, + "loss": 0.4554, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.546875, + "rewards/margins": 6.2265625, + "rewards/rejected": -4.6875, + "step": 1227 + }, + { + "epoch": 0.2436628801031797, + "grad_norm": 50.090156836001086, + "learning_rate": 9.447516888480214e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.3359375, + "logps/chosen": -927.0, + "logps/rejected": -834.0, + "loss": 0.4761, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.30322265625, + "rewards/margins": 8.9453125, + "rewards/rejected": -7.64453125, + "step": 1228 + }, + { + "epoch": 0.24386130264398034, + "grad_norm": 34.053290498941045, + "learning_rate": 9.446019703188098e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.3671875, + "logps/chosen": -842.0, + "logps/rejected": -681.0, + "loss": 0.4194, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.333984375, + "rewards/margins": 6.578125, + "rewards/rejected": -4.24609375, + "step": 1229 + }, + { + "epoch": 0.244059725184781, + "grad_norm": 33.5975320272068, + "learning_rate": 9.444520625060423e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 4.10546875, + "logps/chosen": -1087.0, + "logps/rejected": -757.5, + "loss": 0.5011, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.3681640625, + "rewards/margins": 5.1328125, + "rewards/rejected": -3.763671875, + "step": 1230 + }, + { + "epoch": 0.24425814772558163, + "grad_norm": 33.9839235100804, + "learning_rate": 9.443019654816274e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.40234375, + "logps/chosen": -774.0, + "logps/rejected": -774.0, + "loss": 0.4867, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.52734375, + "rewards/margins": 6.453125, + "rewards/rejected": -3.9296875, + "step": 1231 + }, + { + "epoch": 0.24445657026638226, + "grad_norm": 
26.775040486223975, + "learning_rate": 9.441516793175637e-07, + "logits/chosen": 3.828125, + "logits/rejected": 3.8984375, + "logps/chosen": -623.0, + "logps/rejected": -467.75, + "loss": 0.5917, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.984375, + "rewards/margins": 3.8359375, + "rewards/rejected": -1.8515625, + "step": 1232 + }, + { + "epoch": 0.2446549928071829, + "grad_norm": 39.355831801702706, + "learning_rate": 9.440012040859408e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.56640625, + "logps/chosen": -1236.0, + "logps/rejected": -837.0, + "loss": 0.3984, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.53125, + "rewards/margins": 6.515625, + "rewards/rejected": -4.98046875, + "step": 1233 + }, + { + "epoch": 0.24485341534798352, + "grad_norm": 45.89650268719728, + "learning_rate": 9.438505398589392e-07, + "logits/chosen": 3.74609375, + "logits/rejected": 3.859375, + "logps/chosen": -825.0, + "logps/rejected": -581.5, + "loss": 0.6224, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.9892578125, + "rewards/margins": 4.13671875, + "rewards/rejected": -3.1484375, + "step": 1234 + }, + { + "epoch": 0.24505183788878418, + "grad_norm": 38.371454507591686, + "learning_rate": 9.436996867088294e-07, + "logits/chosen": 3.859375, + "logits/rejected": 4.00390625, + "logps/chosen": -957.0, + "logps/rejected": -585.5, + "loss": 0.4042, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.091796875, + "rewards/margins": 5.140625, + "rewards/rejected": -3.046875, + "step": 1235 + }, + { + "epoch": 0.2452502604295848, + "grad_norm": 37.79489490873024, + "learning_rate": 9.435486447079736e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.4921875, + "logps/chosen": -889.0, + "logps/rejected": -578.0, + "loss": 0.5662, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.45849609375, + "rewards/margins": 4.35546875, + "rewards/rejected": -3.890625, + "step": 1236 + }, + { + "epoch": 0.24544868297038544, + "grad_norm": 42.32605197126146, 
+ "learning_rate": 9.433974139288235e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.40625, + "logps/chosen": -1429.0, + "logps/rejected": -920.0, + "loss": 0.3869, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.73828125, + "rewards/margins": 6.37890625, + "rewards/rejected": -3.623046875, + "step": 1237 + }, + { + "epoch": 0.24564710551118607, + "grad_norm": 32.34115035942022, + "learning_rate": 9.432459944439219e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.9296875, + "logps/chosen": -1023.0, + "logps/rejected": -687.0, + "loss": 0.3938, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.287109375, + "rewards/margins": 5.828125, + "rewards/rejected": -3.5390625, + "step": 1238 + }, + { + "epoch": 0.2458455280519867, + "grad_norm": 38.182332064230636, + "learning_rate": 9.430943863259024e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 4.0078125, + "logps/chosen": -891.0, + "logps/rejected": -726.0, + "loss": 0.4372, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0625, + "rewards/margins": 4.984375, + "rewards/rejected": -2.921875, + "step": 1239 + }, + { + "epoch": 0.24604395059278733, + "grad_norm": 45.24199858879473, + "learning_rate": 9.429425896474881e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.296875, + "logps/chosen": -1071.0, + "logps/rejected": -685.0, + "loss": 0.5317, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.7333984375, + "rewards/margins": 5.62890625, + "rewards/rejected": -3.90234375, + "step": 1240 + }, + { + "epoch": 0.246242373133588, + "grad_norm": 28.43637429821167, + "learning_rate": 9.427906044814935e-07, + "logits/chosen": 4.5390625, + "logits/rejected": 4.65625, + "logps/chosen": -798.0, + "logps/rejected": -595.5, + "loss": 0.6026, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.279296875, + "rewards/margins": 5.0625, + "rewards/rejected": -2.7890625, + "step": 1241 + }, + { + "epoch": 0.24644079567438862, + "grad_norm": 31.870955524473807, + "learning_rate": 
9.426384309008233e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.73046875, + "logps/chosen": -1164.0, + "logps/rejected": -1407.0, + "loss": 0.5484, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.1328125, + "rewards/margins": 5.92578125, + "rewards/rejected": -3.79296875, + "step": 1242 + }, + { + "epoch": 0.24663921821518925, + "grad_norm": 34.977112186498275, + "learning_rate": 9.424860689784724e-07, + "logits/chosen": 3.921875, + "logits/rejected": 3.93359375, + "logps/chosen": -948.0, + "logps/rejected": -757.0, + "loss": 0.4589, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.787109375, + "rewards/margins": 6.125, + "rewards/rejected": -4.3359375, + "step": 1243 + }, + { + "epoch": 0.24683764075598988, + "grad_norm": 32.6210009881869, + "learning_rate": 9.423335187875259e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.4140625, + "logps/chosen": -1079.5, + "logps/rejected": -777.5, + "loss": 0.4639, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.4873046875, + "rewards/margins": 6.5859375, + "rewards/rejected": -5.08984375, + "step": 1244 + }, + { + "epoch": 0.2470360632967905, + "grad_norm": 32.52914693068329, + "learning_rate": 9.421807804011597e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.09765625, + "logps/chosen": -1297.0, + "logps/rejected": -805.0, + "loss": 0.4931, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.298828125, + "rewards/margins": 5.57421875, + "rewards/rejected": -3.26953125, + "step": 1245 + }, + { + "epoch": 0.24723448583759114, + "grad_norm": 50.886583071327706, + "learning_rate": 9.420278538926395e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.15625, + "logps/chosen": -1223.0, + "logps/rejected": -1128.0, + "loss": 0.399, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.544921875, + "rewards/margins": 7.53125, + "rewards/rejected": -5.9921875, + "step": 1246 + }, + { + "epoch": 0.2474329083783918, + "grad_norm": 42.581765105299446, + "learning_rate": 
9.418747393353215e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.40625, + "logps/chosen": -1058.0, + "logps/rejected": -808.0, + "loss": 0.5133, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.0263671875, + "rewards/margins": 5.078125, + "rewards/rejected": -3.052734375, + "step": 1247 + }, + { + "epoch": 0.24763133091919243, + "grad_norm": 44.51128188214373, + "learning_rate": 9.417214368026519e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.0625, + "logps/chosen": -1127.0, + "logps/rejected": -884.0, + "loss": 0.3991, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.35546875, + "rewards/margins": 6.5390625, + "rewards/rejected": -4.1875, + "step": 1248 + }, + { + "epoch": 0.24782975345999306, + "grad_norm": 33.73497790379063, + "learning_rate": 9.415679463681675e-07, + "logits/chosen": 3.74609375, + "logits/rejected": 3.89453125, + "logps/chosen": -972.0, + "logps/rejected": -703.0, + "loss": 0.4431, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.990234375, + "rewards/margins": 6.0234375, + "rewards/rejected": -4.03515625, + "step": 1249 + }, + { + "epoch": 0.2480281760007937, + "grad_norm": 39.91229611830401, + "learning_rate": 9.414142681054945e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.8671875, + "logps/chosen": -971.0, + "logps/rejected": -625.5, + "loss": 0.4845, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.5380859375, + "rewards/margins": 4.8671875, + "rewards/rejected": -3.328125, + "step": 1250 + }, + { + "epoch": 0.24822659854159432, + "grad_norm": 36.68396373167104, + "learning_rate": 9.412604020883501e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.2890625, + "logps/chosen": -1156.0, + "logps/rejected": -856.5, + "loss": 0.4739, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.16357421875, + "rewards/margins": 5.75, + "rewards/rejected": -3.59765625, + "step": 1251 + }, + { + "epoch": 0.24842502108239495, + "grad_norm": 37.342541610838474, + "learning_rate": 
9.411063483905408e-07, + "logits/chosen": 3.9375, + "logits/rejected": 4.1875, + "logps/chosen": -886.0, + "logps/rejected": -1506.0, + "loss": 0.539, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.126953125, + "rewards/margins": 7.265625, + "rewards/rejected": -5.1484375, + "step": 1252 + }, + { + "epoch": 0.2486234436231956, + "grad_norm": 34.68043092370668, + "learning_rate": 9.409521070859637e-07, + "logits/chosen": 4.6328125, + "logits/rejected": 4.8046875, + "logps/chosen": -819.0, + "logps/rejected": -572.5, + "loss": 0.4144, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.244140625, + "rewards/margins": 4.90234375, + "rewards/rejected": -2.65625, + "step": 1253 + }, + { + "epoch": 0.24882186616399624, + "grad_norm": 36.28184524975767, + "learning_rate": 9.407976782486051e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.36328125, + "logps/chosen": -849.0, + "logps/rejected": -469.5, + "loss": 0.4742, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.923828125, + "rewards/margins": 4.953125, + "rewards/rejected": -3.01953125, + "step": 1254 + }, + { + "epoch": 0.24902028870479687, + "grad_norm": 39.18601928896123, + "learning_rate": 9.406430619525423e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 4.0625, + "logps/chosen": -978.0, + "logps/rejected": -972.0, + "loss": 0.4792, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.5556640625, + "rewards/margins": 5.6484375, + "rewards/rejected": -4.08984375, + "step": 1255 + }, + { + "epoch": 0.2492187112455975, + "grad_norm": 37.62469819367042, + "learning_rate": 9.404882582719417e-07, + "logits/chosen": 3.6328125, + "logits/rejected": 3.72265625, + "logps/chosen": -896.5, + "logps/rejected": -617.0, + "loss": 0.53, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.2412109375, + "rewards/margins": 5.28515625, + "rewards/rejected": -4.041015625, + "step": 1256 + }, + { + "epoch": 0.24941713378639813, + "grad_norm": 36.54681930395103, + "learning_rate": 9.4033326728106e-07, 
+ "logits/chosen": 4.1484375, + "logits/rejected": 3.96484375, + "logps/chosen": -1101.0, + "logps/rejected": -783.0, + "loss": 0.4249, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.044189453125, + "rewards/margins": 6.0859375, + "rewards/rejected": -4.03125, + "step": 1257 + }, + { + "epoch": 0.24961555632719876, + "grad_norm": 34.24709134546054, + "learning_rate": 9.401780890542437e-07, + "logits/chosen": 3.828125, + "logits/rejected": 3.83203125, + "logps/chosen": -1376.0, + "logps/rejected": -701.5, + "loss": 0.4588, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.9599609375, + "rewards/margins": 11.373046875, + "rewards/rejected": -10.40625, + "step": 1258 + }, + { + "epoch": 0.24981397886799941, + "grad_norm": 52.54706250504872, + "learning_rate": 9.400227236659288e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.27734375, + "logps/chosen": -1229.0, + "logps/rejected": -947.0, + "loss": 0.4252, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.9033203125, + "rewards/margins": 8.09375, + "rewards/rejected": -6.17578125, + "step": 1259 + }, + { + "epoch": 0.25001240140880004, + "grad_norm": 40.35580942960692, + "learning_rate": 9.398671711906414e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.3203125, + "logps/chosen": -1105.0, + "logps/rejected": -1574.0, + "loss": 0.4845, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.9306640625, + "rewards/margins": 8.81640625, + "rewards/rejected": -6.87109375, + "step": 1260 + }, + { + "epoch": 0.25021082394960065, + "grad_norm": 33.63997965232486, + "learning_rate": 9.397114317029974e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.86328125, + "logps/chosen": -1246.0, + "logps/rejected": -889.0, + "loss": 0.3189, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.33203125, + "rewards/margins": 8.4921875, + "rewards/rejected": -6.15625, + "step": 1261 + }, + { + "epoch": 0.2504092464904013, + "grad_norm": 52.960412915242095, + "learning_rate": 
9.39555505277702e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.30078125, + "logps/chosen": -1132.5, + "logps/rejected": -703.5, + "loss": 0.4693, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.606201171875, + "rewards/margins": 6.359375, + "rewards/rejected": -4.74609375, + "step": 1262 + }, + { + "epoch": 0.25060766903120196, + "grad_norm": 33.6232750234766, + "learning_rate": 9.393993919895507e-07, + "logits/chosen": 3.984375, + "logits/rejected": 4.015625, + "logps/chosen": -831.0, + "logps/rejected": -578.0, + "loss": 0.5134, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.166015625, + "rewards/margins": 5.953125, + "rewards/rejected": -3.78515625, + "step": 1263 + }, + { + "epoch": 0.25080609157200257, + "grad_norm": 31.17275146797146, + "learning_rate": 9.392430919134279e-07, + "logits/chosen": 3.75, + "logits/rejected": 3.37109375, + "logps/chosen": -1594.0, + "logps/rejected": -608.5, + "loss": 0.526, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.193359375, + "rewards/margins": 3.48291015625, + "rewards/rejected": -3.287109375, + "step": 1264 + }, + { + "epoch": 0.2510045141128032, + "grad_norm": 49.07339881264742, + "learning_rate": 9.390866051243083e-07, + "logits/chosen": 3.8046875, + "logits/rejected": 3.9453125, + "logps/chosen": -1056.0, + "logps/rejected": -687.0, + "loss": 0.4244, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.1689453125, + "rewards/margins": 5.9296875, + "rewards/rejected": -4.7578125, + "step": 1265 + }, + { + "epoch": 0.2512029366536038, + "grad_norm": 35.192069300662695, + "learning_rate": 9.389299316972556e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.9453125, + "logps/chosen": -1204.0, + "logps/rejected": -757.0, + "loss": 0.397, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.03125, + "rewards/margins": 6.5703125, + "rewards/rejected": -4.54296875, + "step": 1266 + }, + { + "epoch": 0.2514013591944045, + "grad_norm": 37.42670339508042, + "learning_rate": 
9.387730717074233e-07, + "logits/chosen": 4.46875, + "logits/rejected": 4.140625, + "logps/chosen": -1261.0, + "logps/rejected": -660.5, + "loss": 0.4437, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.3564453125, + "rewards/margins": 7.078125, + "rewards/rejected": -4.716796875, + "step": 1267 + }, + { + "epoch": 0.25159978173520514, + "grad_norm": 42.151916127940666, + "learning_rate": 9.386160252300544e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.09375, + "logps/chosen": -1000.0, + "logps/rejected": -690.5, + "loss": 0.3628, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.03125, + "rewards/margins": 6.8125, + "rewards/rejected": -4.78515625, + "step": 1268 + }, + { + "epoch": 0.25179820427600574, + "grad_norm": 36.99782466553618, + "learning_rate": 9.384587923404813e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 4.1796875, + "logps/chosen": -1005.0, + "logps/rejected": -765.5, + "loss": 0.3267, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.7578125, + "rewards/margins": 7.53125, + "rewards/rejected": -5.76171875, + "step": 1269 + }, + { + "epoch": 0.2519966268168064, + "grad_norm": 46.68270305674778, + "learning_rate": 9.383013731141258e-07, + "logits/chosen": 3.66015625, + "logits/rejected": 3.76171875, + "logps/chosen": -1056.0, + "logps/rejected": -799.0, + "loss": 0.4195, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.767578125, + "rewards/margins": 6.125, + "rewards/rejected": -4.357421875, + "step": 1270 + }, + { + "epoch": 0.252195049357607, + "grad_norm": 45.01231090733442, + "learning_rate": 9.381437676264991e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 4.21484375, + "logps/chosen": -982.0, + "logps/rejected": -658.5, + "loss": 0.387, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.5888671875, + "rewards/margins": 6.375, + "rewards/rejected": -4.78125, + "step": 1271 + }, + { + "epoch": 0.25239347189840766, + "grad_norm": 35.718609160786464, + "learning_rate": 9.379859759532019e-07, + 
"logits/chosen": 3.85546875, + "logits/rejected": 4.05859375, + "logps/chosen": -983.0, + "logps/rejected": -608.0, + "loss": 0.4931, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.666015625, + "rewards/margins": 5.984375, + "rewards/rejected": -4.31640625, + "step": 1272 + }, + { + "epoch": 0.2525918944392083, + "grad_norm": 36.2973241208669, + "learning_rate": 9.378279981699238e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 3.76953125, + "logps/chosen": -960.0, + "logps/rejected": -629.5, + "loss": 0.4352, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.7734375, + "rewards/margins": 6.0390625, + "rewards/rejected": -4.25, + "step": 1273 + }, + { + "epoch": 0.2527903169800089, + "grad_norm": 34.10895816443697, + "learning_rate": 9.37669834352444e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.84375, + "logps/chosen": -681.5, + "logps/rejected": -495.5, + "loss": 0.5899, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.6142578125, + "rewards/margins": 4.48828125, + "rewards/rejected": -2.880859375, + "step": 1274 + }, + { + "epoch": 0.2529887395208096, + "grad_norm": 41.896405074737736, + "learning_rate": 9.375114845766311e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.390625, + "logps/chosen": -1188.0, + "logps/rejected": -884.0, + "loss": 0.5474, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.313720703125, + "rewards/margins": 4.6484375, + "rewards/rejected": -3.328125, + "step": 1275 + }, + { + "epoch": 0.2531871620616102, + "grad_norm": 30.49285087612464, + "learning_rate": 9.373529489184424e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.265625, + "logps/chosen": -1282.0, + "logps/rejected": -911.0, + "loss": 0.3738, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.5087890625, + "rewards/margins": 6.1328125, + "rewards/rejected": -4.62109375, + "step": 1276 + }, + { + "epoch": 0.25338558460241084, + "grad_norm": 37.306209865514326, + "learning_rate": 9.37194227453925e-07, + 
"logits/chosen": 3.7265625, + "logits/rejected": 3.64453125, + "logps/chosen": -1195.0, + "logps/rejected": -825.0, + "loss": 0.4106, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.03515625, + "rewards/margins": 5.828125, + "rewards/rejected": -3.796875, + "step": 1277 + }, + { + "epoch": 0.25358400714321144, + "grad_norm": 36.564949597028665, + "learning_rate": 9.370353202592143e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.09375, + "logps/chosen": -821.0, + "logps/rejected": -730.5, + "loss": 0.5716, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.642578125, + "rewards/margins": 3.455078125, + "rewards/rejected": -1.818359375, + "step": 1278 + }, + { + "epoch": 0.2537824296840121, + "grad_norm": 33.90748958616373, + "learning_rate": 9.368762274105356e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 3.984375, + "logps/chosen": -718.0, + "logps/rejected": -575.5, + "loss": 0.5654, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.2783203125, + "rewards/margins": 4.3310546875, + "rewards/rejected": -3.05078125, + "step": 1279 + }, + { + "epoch": 0.25398085222481276, + "grad_norm": 47.10988455988521, + "learning_rate": 9.367169489842028e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.15625, + "logps/chosen": -1163.0, + "logps/rejected": -804.5, + "loss": 0.4373, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1845703125, + "rewards/margins": 12.421875, + "rewards/rejected": -10.25390625, + "step": 1280 + }, + { + "epoch": 0.25417927476561336, + "grad_norm": 40.147411954812775, + "learning_rate": 9.36557485056619e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 3.99609375, + "logps/chosen": -1034.0, + "logps/rejected": -747.0, + "loss": 0.5094, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.9794921875, + "rewards/margins": 5.716796875, + "rewards/rejected": -3.734375, + "step": 1281 + }, + { + "epoch": 0.254377697306414, + "grad_norm": 33.14121946290807, + "learning_rate": 9.363978357042758e-07, + 
"logits/chosen": 3.97265625, + "logits/rejected": 3.9609375, + "logps/chosen": -1038.0, + "logps/rejected": -1111.0, + "loss": 0.4504, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1982421875, + "rewards/margins": 5.2578125, + "rewards/rejected": -4.068359375, + "step": 1282 + }, + { + "epoch": 0.2545761198472146, + "grad_norm": 42.4228779605183, + "learning_rate": 9.362380010037548e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.0390625, + "logps/chosen": -939.0, + "logps/rejected": -762.0, + "loss": 0.5078, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.33721923828125, + "rewards/margins": 4.7890625, + "rewards/rejected": -3.4453125, + "step": 1283 + }, + { + "epoch": 0.2547745423880153, + "grad_norm": 32.42551832569098, + "learning_rate": 9.360779810317255e-07, + "logits/chosen": 3.68359375, + "logits/rejected": 3.84765625, + "logps/chosen": -887.0, + "logps/rejected": -736.5, + "loss": 0.4194, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.0, + "rewards/margins": 15.8046875, + "rewards/rejected": -13.73046875, + "step": 1284 + }, + { + "epoch": 0.25497296492881594, + "grad_norm": 28.66847578671024, + "learning_rate": 9.359177758649466e-07, + "logits/chosen": 4.35546875, + "logits/rejected": 4.453125, + "logps/chosen": -1408.0, + "logps/rejected": -1196.0, + "loss": 0.3204, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.0, + "rewards/margins": 9.265625, + "rewards/rejected": -7.25, + "step": 1285 + }, + { + "epoch": 0.25517138746961654, + "grad_norm": 40.73772187819649, + "learning_rate": 9.357573855802661e-07, + "logits/chosen": 4.38671875, + "logits/rejected": 4.1875, + "logps/chosen": -1037.0, + "logps/rejected": -842.0, + "loss": 0.4804, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.91796875, + "rewards/margins": 5.185546875, + "rewards/rejected": -4.265625, + "step": 1286 + }, + { + "epoch": 0.2553698100104172, + "grad_norm": 34.4715395379045, + "learning_rate": 9.355968102546198e-07, + "logits/chosen": 
3.828125, + "logits/rejected": 3.8671875, + "logps/chosen": -976.0, + "logps/rejected": -708.0, + "loss": 0.3904, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.541015625, + "rewards/margins": 5.8984375, + "rewards/rejected": -4.359375, + "step": 1287 + }, + { + "epoch": 0.2555682325512178, + "grad_norm": 40.717020671652044, + "learning_rate": 9.354360499650331e-07, + "logits/chosen": 4.32421875, + "logits/rejected": 4.6015625, + "logps/chosen": -704.0, + "logps/rejected": -774.0, + "loss": 0.5472, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.43359375, + "rewards/margins": 4.89453125, + "rewards/rejected": -3.4638671875, + "step": 1288 + }, + { + "epoch": 0.25576665509201846, + "grad_norm": 38.27367448640431, + "learning_rate": 9.352751047886198e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.1328125, + "logps/chosen": -885.0, + "logps/rejected": -620.0, + "loss": 0.3756, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.240234375, + "rewards/margins": 6.0234375, + "rewards/rejected": -3.78125, + "step": 1289 + }, + { + "epoch": 0.2559650776328191, + "grad_norm": 34.67300555705849, + "learning_rate": 9.351139748025826e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.609375, + "logps/chosen": -817.0, + "logps/rejected": -604.5, + "loss": 0.5352, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.4326171875, + "rewards/margins": 5.046875, + "rewards/rejected": -3.609375, + "step": 1290 + }, + { + "epoch": 0.2561635001736197, + "grad_norm": 37.33983734406701, + "learning_rate": 9.349526600842126e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.26953125, + "logps/chosen": -793.0, + "logps/rejected": -887.0, + "loss": 0.527, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.84765625, + "rewards/margins": 5.953125, + "rewards/rejected": -5.10546875, + "step": 1291 + }, + { + "epoch": 0.2563619227144204, + "grad_norm": 32.4813130516123, + "learning_rate": 9.347911607108894e-07, + "logits/chosen": 3.875, + 
"logits/rejected": 4.03125, + "logps/chosen": -1032.0, + "logps/rejected": -808.5, + "loss": 0.4641, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.166015625, + "rewards/margins": 14.59375, + "rewards/rejected": -12.47265625, + "step": 1292 + }, + { + "epoch": 0.256560345255221, + "grad_norm": 38.55686192155967, + "learning_rate": 9.346294767600817e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.0390625, + "logps/chosen": -786.0, + "logps/rejected": -617.0, + "loss": 0.4965, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.435546875, + "rewards/margins": 5.89453125, + "rewards/rejected": -4.4609375, + "step": 1293 + }, + { + "epoch": 0.25675876779602164, + "grad_norm": 37.76035272781032, + "learning_rate": 9.344676083093462e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.375, + "logps/chosen": -1928.0, + "logps/rejected": -1083.0, + "loss": 0.5078, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.21875, + "rewards/margins": 4.98828125, + "rewards/rejected": -4.76953125, + "step": 1294 + }, + { + "epoch": 0.25695719033682224, + "grad_norm": 35.94842488826791, + "learning_rate": 9.343055554363285e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 3.75, + "logps/chosen": -832.0, + "logps/rejected": -619.0, + "loss": 0.5318, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.87158203125, + "rewards/margins": 4.7734375, + "rewards/rejected": -3.90234375, + "step": 1295 + }, + { + "epoch": 0.2571556128776229, + "grad_norm": 35.15258421411601, + "learning_rate": 9.341433182187623e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.3671875, + "logps/chosen": -1133.0, + "logps/rejected": -672.0, + "loss": 0.4226, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.306640625, + "rewards/margins": 5.81640625, + "rewards/rejected": -3.515625, + "step": 1296 + }, + { + "epoch": 0.25735403541842355, + "grad_norm": 34.914700654240434, + "learning_rate": 9.3398089673447e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 
3.91796875, + "logps/chosen": -1130.0, + "logps/rejected": -878.0, + "loss": 0.3236, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.51171875, + "rewards/margins": 7.734375, + "rewards/rejected": -5.234375, + "step": 1297 + }, + { + "epoch": 0.25755245795922416, + "grad_norm": 30.339449931063974, + "learning_rate": 9.338182910613623e-07, + "logits/chosen": 4.28515625, + "logits/rejected": 4.453125, + "logps/chosen": -856.0, + "logps/rejected": -630.0, + "loss": 0.3521, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4296875, + "rewards/margins": 7.6875, + "rewards/rejected": -5.2578125, + "step": 1298 + }, + { + "epoch": 0.2577508805000248, + "grad_norm": 34.118994670466606, + "learning_rate": 9.336555012774383e-07, + "logits/chosen": 4.03125, + "logits/rejected": 3.74609375, + "logps/chosen": -921.0, + "logps/rejected": -763.0, + "loss": 0.5065, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.4814453125, + "rewards/margins": 5.1953125, + "rewards/rejected": -3.7109375, + "step": 1299 + }, + { + "epoch": 0.2579493030408254, + "grad_norm": 33.6059145358184, + "learning_rate": 9.334925274607852e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.2734375, + "logps/chosen": -1428.0, + "logps/rejected": -744.0, + "loss": 0.384, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.49609375, + "rewards/margins": 7.140625, + "rewards/rejected": -4.64453125, + "step": 1300 + }, + { + "epoch": 0.2581477255816261, + "grad_norm": 35.22903568054287, + "learning_rate": 9.333293696895789e-07, + "logits/chosen": 3.8515625, + "logits/rejected": 3.80078125, + "logps/chosen": -1156.0, + "logps/rejected": -775.0, + "loss": 0.3967, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.31640625, + "rewards/margins": 7.359375, + "rewards/rejected": -5.05078125, + "step": 1301 + }, + { + "epoch": 0.25834614812242673, + "grad_norm": 41.01718555747509, + "learning_rate": 9.33166028042083e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.38671875, + 
"logps/chosen": -1124.0, + "logps/rejected": -959.5, + "loss": 0.3933, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.18359375, + "rewards/margins": 7.171875, + "rewards/rejected": -4.9921875, + "step": 1302 + }, + { + "epoch": 0.25854457066322734, + "grad_norm": 38.066401358631694, + "learning_rate": 9.330025025966498e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 4.03125, + "logps/chosen": -1281.0, + "logps/rejected": -817.0, + "loss": 0.39, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.138671875, + "rewards/margins": 7.7265625, + "rewards/rejected": -5.59765625, + "step": 1303 + }, + { + "epoch": 0.258742993204028, + "grad_norm": 30.821353745241737, + "learning_rate": 9.328387934317196e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 3.90234375, + "logps/chosen": -1243.0, + "logps/rejected": -2403.5, + "loss": 0.5696, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.2275390625, + "rewards/margins": 9.109375, + "rewards/rejected": -7.875, + "step": 1304 + }, + { + "epoch": 0.2589414157448286, + "grad_norm": 37.692494215824425, + "learning_rate": 9.326749006258204e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.796875, + "logps/chosen": -1226.0, + "logps/rejected": -927.0, + "loss": 0.414, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.1123046875, + "rewards/margins": 7.546875, + "rewards/rejected": -5.4375, + "step": 1305 + }, + { + "epoch": 0.25913983828562925, + "grad_norm": 29.575098026716905, + "learning_rate": 9.325108242575691e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.265625, + "logps/chosen": -1276.0, + "logps/rejected": -1240.0, + "loss": 0.3196, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3515625, + "rewards/margins": 9.9296875, + "rewards/rejected": -7.578125, + "step": 1306 + }, + { + "epoch": 0.25933826082642986, + "grad_norm": 34.18851602828729, + "learning_rate": 9.3234656440567e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 4.0078125, + "logps/chosen": 
-1223.0, + "logps/rejected": -953.0, + "loss": 0.4556, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.7734375, + "rewards/margins": 6.8125, + "rewards/rejected": -5.0390625, + "step": 1307 + }, + { + "epoch": 0.2595366833672305, + "grad_norm": 38.68127144918661, + "learning_rate": 9.321821211489158e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.48828125, + "logps/chosen": -1156.0, + "logps/rejected": -690.0, + "loss": 0.3786, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.001953125, + "rewards/margins": 5.86328125, + "rewards/rejected": -3.87109375, + "step": 1308 + }, + { + "epoch": 0.25973510590803117, + "grad_norm": 38.38754841601856, + "learning_rate": 9.320174945661869e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.890625, + "logps/chosen": -744.0, + "logps/rejected": -534.0, + "loss": 0.4944, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5947265625, + "rewards/margins": 5.12890625, + "rewards/rejected": -3.51953125, + "step": 1309 + }, + { + "epoch": 0.2599335284488318, + "grad_norm": 40.22441745065487, + "learning_rate": 9.318526847364515e-07, + "logits/chosen": 3.96875, + "logits/rejected": 3.99609375, + "logps/chosen": -1160.0, + "logps/rejected": -643.5, + "loss": 0.5267, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.125, + "rewards/margins": 3.51171875, + "rewards/rejected": -2.38671875, + "step": 1310 + }, + { + "epoch": 0.26013195098963243, + "grad_norm": 38.38395686876909, + "learning_rate": 9.316876917387664e-07, + "logits/chosen": 3.59375, + "logits/rejected": 3.890625, + "logps/chosen": -1042.0, + "logps/rejected": -839.0, + "loss": 0.5733, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.583984375, + "rewards/margins": 5.81640625, + "rewards/rejected": -4.23828125, + "step": 1311 + }, + { + "epoch": 0.26033037353043303, + "grad_norm": 38.68373264455986, + "learning_rate": 9.315225156522754e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 3.94921875, + "logps/chosen": -804.0, + 
"logps/rejected": -629.5, + "loss": 0.4882, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.875, + "rewards/margins": 4.859375, + "rewards/rejected": -2.97265625, + "step": 1312 + }, + { + "epoch": 0.2605287960712337, + "grad_norm": 35.09991836559879, + "learning_rate": 9.31357156556211e-07, + "logits/chosen": 3.70703125, + "logits/rejected": 3.7421875, + "logps/chosen": -1248.0, + "logps/rejected": -831.0, + "loss": 0.3917, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.41015625, + "rewards/margins": 6.59375, + "rewards/rejected": -4.16796875, + "step": 1313 + }, + { + "epoch": 0.26072721861203435, + "grad_norm": 41.36477141934642, + "learning_rate": 9.311916145298925e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.515625, + "logps/chosen": -1090.0, + "logps/rejected": -782.0, + "loss": 0.4904, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.443359375, + "rewards/margins": 8.0703125, + "rewards/rejected": -6.6171875, + "step": 1314 + }, + { + "epoch": 0.26092564115283495, + "grad_norm": 42.77842233248467, + "learning_rate": 9.310258896527278e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.94921875, + "logps/chosen": -1288.0, + "logps/rejected": -788.0, + "loss": 0.4669, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.962890625, + "rewards/margins": 5.25, + "rewards/rejected": -4.2890625, + "step": 1315 + }, + { + "epoch": 0.2611240636936356, + "grad_norm": 32.47882722408595, + "learning_rate": 9.308599820042122e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.73046875, + "logps/chosen": -882.0, + "logps/rejected": -606.5, + "loss": 0.3524, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.01171875, + "rewards/margins": 6.15625, + "rewards/rejected": -4.14453125, + "step": 1316 + }, + { + "epoch": 0.2613224862344362, + "grad_norm": 44.78022486134136, + "learning_rate": 9.306938916639285e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.234375, + "logps/chosen": -1476.0, + "logps/rejected": -1081.0, + 
"loss": 0.4528, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.134765625, + "rewards/margins": 7.4140625, + "rewards/rejected": -5.26953125, + "step": 1317 + }, + { + "epoch": 0.26152090877523687, + "grad_norm": 28.238081525363526, + "learning_rate": 9.305276187115473e-07, + "logits/chosen": 4.31640625, + "logits/rejected": 4.19921875, + "logps/chosen": -898.25, + "logps/rejected": -509.5, + "loss": 0.3739, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.765625, + "rewards/margins": 6.69140625, + "rewards/rejected": -3.9296875, + "step": 1318 + }, + { + "epoch": 0.26171933131603753, + "grad_norm": 33.87214585642699, + "learning_rate": 9.303611632268268e-07, + "logits/chosen": 4.3984375, + "logits/rejected": 4.3515625, + "logps/chosen": -1121.0, + "logps/rejected": -767.5, + "loss": 0.5545, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.1842041015625, + "rewards/margins": 4.5732421875, + "rewards/rejected": -3.38671875, + "step": 1319 + }, + { + "epoch": 0.26191775385683813, + "grad_norm": 34.75012367953433, + "learning_rate": 9.301945252896127e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.28515625, + "logps/chosen": -703.0, + "logps/rejected": -684.0, + "loss": 0.5084, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.248046875, + "rewards/margins": 4.75390625, + "rewards/rejected": -3.5, + "step": 1320 + }, + { + "epoch": 0.2621161763976388, + "grad_norm": 35.25419339714888, + "learning_rate": 9.300277049798385e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.30859375, + "logps/chosen": -1176.0, + "logps/rejected": -1093.0, + "loss": 0.4415, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.38671875, + "rewards/margins": 7.65625, + "rewards/rejected": -5.26171875, + "step": 1321 + }, + { + "epoch": 0.2623145989384394, + "grad_norm": 33.195029251605085, + "learning_rate": 9.298607023775246e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 3.8125, + "logps/chosen": -1202.0, + "logps/rejected": -692.0, + 
"loss": 0.3663, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.52734375, + "rewards/margins": 6.3671875, + "rewards/rejected": -3.8359375, + "step": 1322 + }, + { + "epoch": 0.26251302147924005, + "grad_norm": 26.438636027723952, + "learning_rate": 9.296935175627794e-07, + "logits/chosen": 4.41796875, + "logits/rejected": 4.3828125, + "logps/chosen": -897.0, + "logps/rejected": -564.5, + "loss": 0.4054, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.611328125, + "rewards/margins": 5.595703125, + "rewards/rejected": -3.97265625, + "step": 1323 + }, + { + "epoch": 0.26271144402004065, + "grad_norm": 35.9798815892037, + "learning_rate": 9.295261506157985e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.89453125, + "logps/chosen": -706.0, + "logps/rejected": -491.0, + "loss": 0.5629, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.474609375, + "rewards/margins": 4.4921875, + "rewards/rejected": -3.015625, + "step": 1324 + }, + { + "epoch": 0.2629098665608413, + "grad_norm": 31.549743685820882, + "learning_rate": 9.293586016168648e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.2578125, + "logps/chosen": -696.0, + "logps/rejected": -557.0, + "loss": 0.3426, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.861328125, + "rewards/margins": 6.515625, + "rewards/rejected": -4.66015625, + "step": 1325 + }, + { + "epoch": 0.26310828910164197, + "grad_norm": 43.00804557321599, + "learning_rate": 9.291908706463483e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.9140625, + "logps/chosen": -1123.0, + "logps/rejected": -882.0, + "loss": 0.505, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.130859375, + "rewards/margins": 6.0390625, + "rewards/rejected": -4.91015625, + "step": 1326 + }, + { + "epoch": 0.26330671164244257, + "grad_norm": 28.91623365218946, + "learning_rate": 9.290229577847072e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.4921875, + "logps/chosen": -1262.0, + "logps/rejected": -1794.5, + 
"loss": 0.3541, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.064453125, + "rewards/margins": 9.0859375, + "rewards/rejected": -7.0234375, + "step": 1327 + }, + { + "epoch": 0.26350513418324323, + "grad_norm": 40.16620332883976, + "learning_rate": 9.288548631124856e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.13671875, + "logps/chosen": -1219.0, + "logps/rejected": -1165.0, + "loss": 0.3998, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0859375, + "rewards/margins": 6.71875, + "rewards/rejected": -4.62890625, + "step": 1328 + }, + { + "epoch": 0.26370355672404383, + "grad_norm": 38.9861819924333, + "learning_rate": 9.286865867103159e-07, + "logits/chosen": 3.85546875, + "logits/rejected": 3.91015625, + "logps/chosen": -999.0, + "logps/rejected": -650.5, + "loss": 0.3906, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.52734375, + "rewards/margins": 6.6953125, + "rewards/rejected": -4.16796875, + "step": 1329 + }, + { + "epoch": 0.2639019792648445, + "grad_norm": 38.34845684259244, + "learning_rate": 9.285181286589175e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.47265625, + "logps/chosen": -988.0, + "logps/rejected": -689.5, + "loss": 0.6092, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.62890625, + "rewards/margins": 5.44921875, + "rewards/rejected": -3.814453125, + "step": 1330 + }, + { + "epoch": 0.26410040180564515, + "grad_norm": 34.91799456273551, + "learning_rate": 9.283494890390964e-07, + "logits/chosen": 3.5703125, + "logits/rejected": 4.19921875, + "logps/chosen": -1000.0, + "logps/rejected": -916.0, + "loss": 0.464, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.404541015625, + "rewards/margins": 6.171875, + "rewards/rejected": -4.76171875, + "step": 1331 + }, + { + "epoch": 0.26429882434644575, + "grad_norm": 33.85752501999303, + "learning_rate": 9.281806679317461e-07, + "logits/chosen": 3.55078125, + "logits/rejected": 3.98046875, + "logps/chosen": -590.5, + "logps/rejected": -895.5, + "loss": 
0.4605, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.393310546875, + "rewards/margins": 7.6875, + "rewards/rejected": -6.28125, + "step": 1332 + }, + { + "epoch": 0.2644972468872464, + "grad_norm": 34.33617742067898, + "learning_rate": 9.280116654178472e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 3.6640625, + "logps/chosen": -920.0, + "logps/rejected": -668.0, + "loss": 0.3877, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.162109375, + "rewards/margins": 13.70703125, + "rewards/rejected": -11.59765625, + "step": 1333 + }, + { + "epoch": 0.264695669428047, + "grad_norm": 50.31073049311515, + "learning_rate": 9.27842481578467e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.75390625, + "logps/chosen": -1099.0, + "logps/rejected": -768.0, + "loss": 0.431, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.08203125, + "rewards/margins": 5.6171875, + "rewards/rejected": -3.53125, + "step": 1334 + }, + { + "epoch": 0.26489409196884767, + "grad_norm": 39.882652750566955, + "learning_rate": 9.276731164947602e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.85546875, + "logps/chosen": -763.0, + "logps/rejected": -611.5, + "loss": 0.4522, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.80078125, + "rewards/margins": 4.7109375, + "rewards/rejected": -2.921875, + "step": 1335 + }, + { + "epoch": 0.26509251450964827, + "grad_norm": 39.608414482281525, + "learning_rate": 9.275035702479681e-07, + "logits/chosen": 3.76953125, + "logits/rejected": 3.76953125, + "logps/chosen": -1514.0, + "logps/rejected": -699.0, + "loss": 0.5231, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.1953125, + "rewards/margins": 2.779296875, + "rewards/rejected": -2.9765625, + "step": 1336 + }, + { + "epoch": 0.2652909370504489, + "grad_norm": 37.486043807515074, + "learning_rate": 9.273338429194189e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 3.99609375, + "logps/chosen": -1165.0, + "logps/rejected": -856.5, + "loss": 0.4108, 
+ "rewards/accuracies": 0.90625, + "rewards/chosen": 1.7900390625, + "rewards/margins": 5.9765625, + "rewards/rejected": -4.1953125, + "step": 1337 + }, + { + "epoch": 0.2654893595912496, + "grad_norm": 40.79849431803008, + "learning_rate": 9.271639345905282e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.1015625, + "logps/chosen": -715.0, + "logps/rejected": -521.0, + "loss": 0.4487, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.630859375, + "rewards/margins": 4.921875, + "rewards/rejected": -3.28125, + "step": 1338 + }, + { + "epoch": 0.2656877821320502, + "grad_norm": 38.15381472395473, + "learning_rate": 9.269938453427975e-07, + "logits/chosen": 3.6484375, + "logits/rejected": 3.77734375, + "logps/chosen": -1075.0, + "logps/rejected": -693.0, + "loss": 0.4251, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.61328125, + "rewards/margins": 6.4921875, + "rewards/rejected": -4.890625, + "step": 1339 + }, + { + "epoch": 0.26588620467285085, + "grad_norm": 40.46224291646246, + "learning_rate": 9.26823575257816e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.2109375, + "logps/chosen": -1071.0, + "logps/rejected": -1450.0, + "loss": 0.4871, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.478515625, + "rewards/margins": 7.6796875, + "rewards/rejected": -6.1875, + "step": 1340 + }, + { + "epoch": 0.26608462721365145, + "grad_norm": 38.33821610463239, + "learning_rate": 9.26653124417259e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.6328125, + "logps/chosen": -1261.0, + "logps/rejected": -848.0, + "loss": 0.489, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.00830078125, + "rewards/margins": 5.0546875, + "rewards/rejected": -3.04296875, + "step": 1341 + }, + { + "epoch": 0.2662830497544521, + "grad_norm": 25.511929329110476, + "learning_rate": 9.264824929028888e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.421875, + "logps/chosen": -1258.0, + "logps/rejected": -956.0, + "loss": 0.3016, + 
"rewards/accuracies": 0.90625, + "rewards/chosen": 2.734375, + "rewards/margins": 8.796875, + "rewards/rejected": -6.0625, + "step": 1342 + }, + { + "epoch": 0.26648147229525276, + "grad_norm": 37.69072682683924, + "learning_rate": 9.263116807965543e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 3.7734375, + "logps/chosen": -854.0, + "logps/rejected": -615.5, + "loss": 0.4063, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.22412109375, + "rewards/margins": 6.6875, + "rewards/rejected": -5.4765625, + "step": 1343 + }, + { + "epoch": 0.26667989483605337, + "grad_norm": 28.57125218569516, + "learning_rate": 9.261406881801911e-07, + "logits/chosen": 3.765625, + "logits/rejected": 3.73828125, + "logps/chosen": -1156.0, + "logps/rejected": -665.5, + "loss": 0.3376, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1484375, + "rewards/margins": 7.0703125, + "rewards/rejected": -4.9140625, + "step": 1344 + }, + { + "epoch": 0.266878317376854, + "grad_norm": 37.319264284676805, + "learning_rate": 9.259695151358214e-07, + "logits/chosen": 3.78515625, + "logits/rejected": 3.81640625, + "logps/chosen": -934.0, + "logps/rejected": -866.0, + "loss": 0.47, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.4111328125, + "rewards/margins": 7.04296875, + "rewards/rejected": -4.63525390625, + "step": 1345 + }, + { + "epoch": 0.2670767399176546, + "grad_norm": 35.99883597459367, + "learning_rate": 9.257981617455536e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.26953125, + "logps/chosen": -1179.0, + "logps/rejected": -961.0, + "loss": 0.4764, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.75, + "rewards/margins": 11.46875, + "rewards/rejected": -8.7109375, + "step": 1346 + }, + { + "epoch": 0.2672751624584553, + "grad_norm": 28.90261035043826, + "learning_rate": 9.25626628091583e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 4.02734375, + "logps/chosen": -848.5, + "logps/rejected": -646.5, + "loss": 0.5244, + "rewards/accuracies": 
0.875, + "rewards/chosen": 1.27197265625, + "rewards/margins": 5.66796875, + "rewards/rejected": -4.3828125, + "step": 1347 + }, + { + "epoch": 0.26747358499925594, + "grad_norm": 33.341864511345804, + "learning_rate": 9.254549142561913e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 4.0390625, + "logps/chosen": -1174.0, + "logps/rejected": -788.0, + "loss": 0.4789, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.0625, + "rewards/margins": 5.78125, + "rewards/rejected": -3.728515625, + "step": 1348 + }, + { + "epoch": 0.26767200754005654, + "grad_norm": 42.08756455850682, + "learning_rate": 9.252830203217465e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.18359375, + "logps/chosen": -1238.0, + "logps/rejected": -637.0, + "loss": 0.3763, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.134765625, + "rewards/margins": 7.3359375, + "rewards/rejected": -5.20703125, + "step": 1349 + }, + { + "epoch": 0.2678704300808572, + "grad_norm": 43.48463639114228, + "learning_rate": 9.251109463707032e-07, + "logits/chosen": 4.38671875, + "logits/rejected": 4.62890625, + "logps/chosen": -952.0, + "logps/rejected": -1463.0, + "loss": 0.3615, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.92578125, + "rewards/margins": 9.140625, + "rewards/rejected": -7.22265625, + "step": 1350 + }, + { + "epoch": 0.2680688526216578, + "grad_norm": 44.11525028810542, + "learning_rate": 9.24938692485602e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.015625, + "logps/chosen": -1121.0, + "logps/rejected": -710.0, + "loss": 0.459, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.013671875, + "rewards/margins": 5.9765625, + "rewards/rejected": -3.9609375, + "step": 1351 + }, + { + "epoch": 0.26826727516245846, + "grad_norm": 31.338187954184942, + "learning_rate": 9.247662587490702e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.15234375, + "logps/chosen": -1306.0, + "logps/rejected": -902.0, + "loss": 0.3536, + "rewards/accuracies": 
0.84375, + "rewards/chosen": 2.546875, + "rewards/margins": 7.8125, + "rewards/rejected": -5.25390625, + "step": 1352 + }, + { + "epoch": 0.26846569770325907, + "grad_norm": 50.47757206275344, + "learning_rate": 9.245936452438211e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 3.74609375, + "logps/chosen": -1181.0, + "logps/rejected": -628.0, + "loss": 0.4501, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.234375, + "rewards/margins": 5.98828125, + "rewards/rejected": -3.74609375, + "step": 1353 + }, + { + "epoch": 0.2686641202440597, + "grad_norm": 45.900454045614886, + "learning_rate": 9.244208520526545e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 3.7890625, + "logps/chosen": -1149.0, + "logps/rejected": -766.0, + "loss": 0.4583, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.833984375, + "rewards/margins": 6.203125, + "rewards/rejected": -4.3671875, + "step": 1354 + }, + { + "epoch": 0.2688625427848604, + "grad_norm": 28.695757299879087, + "learning_rate": 9.242478792584561e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.171875, + "logps/chosen": -874.0, + "logps/rejected": -639.0, + "loss": 0.4613, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.41943359375, + "rewards/margins": 6.0390625, + "rewards/rejected": -4.6171875, + "step": 1355 + }, + { + "epoch": 0.269060965325661, + "grad_norm": 40.28619077873202, + "learning_rate": 9.240747269441978e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 4.23046875, + "logps/chosen": -883.5, + "logps/rejected": -635.0, + "loss": 0.3814, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.837890625, + "rewards/margins": 6.5, + "rewards/rejected": -4.671875, + "step": 1356 + }, + { + "epoch": 0.26925938786646164, + "grad_norm": 34.84507472964815, + "learning_rate": 9.239013951929377e-07, + "logits/chosen": 3.703125, + "logits/rejected": 3.92578125, + "logps/chosen": -1069.0, + "logps/rejected": -643.5, + "loss": 0.4629, + "rewards/accuracies": 0.8125, + 
"rewards/chosen": 1.90625, + "rewards/margins": 10.734375, + "rewards/rejected": -8.826171875, + "step": 1357 + }, + { + "epoch": 0.26945781040726224, + "grad_norm": 34.52112881095947, + "learning_rate": 9.237278840878204e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.140625, + "logps/chosen": -912.0, + "logps/rejected": -682.0, + "loss": 0.5479, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.666015625, + "rewards/margins": 5.53125, + "rewards/rejected": -3.85546875, + "step": 1358 + }, + { + "epoch": 0.2696562329480629, + "grad_norm": 35.91450396311356, + "learning_rate": 9.235541937120754e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.9609375, + "logps/chosen": -1107.0, + "logps/rejected": -823.0, + "loss": 0.5294, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.76171875, + "rewards/margins": 5.69140625, + "rewards/rejected": -3.93359375, + "step": 1359 + }, + { + "epoch": 0.26985465548886356, + "grad_norm": 38.85703907952623, + "learning_rate": 9.233803241490194e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.125, + "logps/chosen": -624.0, + "logps/rejected": -523.5, + "loss": 0.4418, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.32666015625, + "rewards/margins": 6.0078125, + "rewards/rejected": -4.671875, + "step": 1360 + }, + { + "epoch": 0.27005307802966416, + "grad_norm": 45.207466802716766, + "learning_rate": 9.232062754820542e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.2109375, + "logps/chosen": -1316.0, + "logps/rejected": -816.0, + "loss": 0.3655, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6484375, + "rewards/margins": 7.0625, + "rewards/rejected": -4.4140625, + "step": 1361 + }, + { + "epoch": 0.2702515005704648, + "grad_norm": 32.571040083181934, + "learning_rate": 9.230320477946682e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.6484375, + "logps/chosen": -1004.0, + "logps/rejected": -853.0, + "loss": 0.5295, + "rewards/accuracies": 0.8125, + 
"rewards/chosen": 1.3994140625, + "rewards/margins": 4.93359375, + "rewards/rejected": -3.53125, + "step": 1362 + }, + { + "epoch": 0.2704499231112654, + "grad_norm": 37.60809281619365, + "learning_rate": 9.22857641170435e-07, + "logits/chosen": 4.3515625, + "logits/rejected": 4.4453125, + "logps/chosen": -1129.0, + "logps/rejected": -918.5, + "loss": 0.3625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.953125, + "rewards/margins": 16.625, + "rewards/rejected": -13.70703125, + "step": 1363 + }, + { + "epoch": 0.2706483456520661, + "grad_norm": 31.048677522516726, + "learning_rate": 9.226830556930145e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.09375, + "logps/chosen": -955.0, + "logps/rejected": -669.0, + "loss": 0.4229, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4794921875, + "rewards/margins": 5.96484375, + "rewards/rejected": -3.49609375, + "step": 1364 + }, + { + "epoch": 0.2708467681928667, + "grad_norm": 31.50052933886999, + "learning_rate": 9.225082914461522e-07, + "logits/chosen": 4.125, + "logits/rejected": 3.89453125, + "logps/chosen": -912.0, + "logps/rejected": -444.0, + "loss": 0.5874, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0908203125, + "rewards/margins": 3.75, + "rewards/rejected": -2.650390625, + "step": 1365 + }, + { + "epoch": 0.27104519073366734, + "grad_norm": 44.7908958332506, + "learning_rate": 9.223333485136796e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.0234375, + "logps/chosen": -838.5, + "logps/rejected": -562.0, + "loss": 0.428, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.384765625, + "rewards/margins": 5.22265625, + "rewards/rejected": -3.8359375, + "step": 1366 + }, + { + "epoch": 0.271243613274468, + "grad_norm": 33.44437990069318, + "learning_rate": 9.221582269795136e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.6484375, + "logps/chosen": -857.0, + "logps/rejected": -749.0, + "loss": 0.3927, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.02734375, + 
"rewards/margins": 6.015625, + "rewards/rejected": -3.99609375, + "step": 1367 + }, + { + "epoch": 0.2714420358152686, + "grad_norm": 41.50279635946243, + "learning_rate": 9.219829269276569e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 3.98046875, + "logps/chosen": -944.0, + "logps/rejected": -698.0, + "loss": 0.4474, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.658203125, + "rewards/margins": 6.03125, + "rewards/rejected": -4.375, + "step": 1368 + }, + { + "epoch": 0.27164045835606926, + "grad_norm": 31.763863469035762, + "learning_rate": 9.218074484421977e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 4.15234375, + "logps/chosen": -1139.0, + "logps/rejected": -1527.5, + "loss": 0.424, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.564453125, + "rewards/margins": 7.74609375, + "rewards/rejected": -5.17578125, + "step": 1369 + }, + { + "epoch": 0.27183888089686986, + "grad_norm": 37.3163902396622, + "learning_rate": 9.216317916073099e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.859375, + "logps/chosen": -901.5, + "logps/rejected": -649.0, + "loss": 0.4913, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.9140625, + "rewards/margins": 4.89453125, + "rewards/rejected": -2.986328125, + "step": 1370 + }, + { + "epoch": 0.2720373034376705, + "grad_norm": 39.03822376347677, + "learning_rate": 9.214559565072532e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.1328125, + "logps/chosen": -1144.0, + "logps/rejected": -609.5, + "loss": 0.3837, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.904296875, + "rewards/margins": 6.92578125, + "rewards/rejected": -4.017578125, + "step": 1371 + }, + { + "epoch": 0.2722357259784712, + "grad_norm": 44.47698440932703, + "learning_rate": 9.212799432263725e-07, + "logits/chosen": 4.078125, + "logits/rejected": 4.1015625, + "logps/chosen": -911.0, + "logps/rejected": -683.0, + "loss": 0.4739, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.44140625, + 
"rewards/margins": 5.578125, + "rewards/rejected": -4.140625, + "step": 1372 + }, + { + "epoch": 0.2724341485192718, + "grad_norm": 30.57816692764687, + "learning_rate": 9.21103751849098e-07, + "logits/chosen": 3.8046875, + "logits/rejected": 3.83203125, + "logps/chosen": -723.5, + "logps/rejected": -521.0, + "loss": 0.4455, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.095703125, + "rewards/margins": 4.73046875, + "rewards/rejected": -2.6328125, + "step": 1373 + }, + { + "epoch": 0.27263257106007244, + "grad_norm": 26.626175765985185, + "learning_rate": 9.209273824599458e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 4.21875, + "logps/chosen": -1271.0, + "logps/rejected": -1025.0, + "loss": 0.2418, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.99609375, + "rewards/margins": 8.8515625, + "rewards/rejected": -5.859375, + "step": 1374 + }, + { + "epoch": 0.27283099360087304, + "grad_norm": 37.528666870883775, + "learning_rate": 9.207508351435171e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 3.9296875, + "logps/chosen": -872.5, + "logps/rejected": -590.0, + "loss": 0.5675, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.76171875, + "rewards/margins": 4.640625, + "rewards/rejected": -2.87890625, + "step": 1375 + }, + { + "epoch": 0.2730294161416737, + "grad_norm": 32.112779107296426, + "learning_rate": 9.205741099844987e-07, + "logits/chosen": 4.36328125, + "logits/rejected": 4.29296875, + "logps/chosen": -1190.0, + "logps/rejected": -681.5, + "loss": 0.4448, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.447265625, + "rewards/margins": 6.125, + "rewards/rejected": -3.67578125, + "step": 1376 + }, + { + "epoch": 0.27322783868247436, + "grad_norm": 37.11838171153414, + "learning_rate": 9.203972070676622e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.23828125, + "logps/chosen": -1118.0, + "logps/rejected": -813.5, + "loss": 0.3869, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.0, + "rewards/margins": 
6.9765625, + "rewards/rejected": -4.96875, + "step": 1377 + }, + { + "epoch": 0.27342626122327496, + "grad_norm": 28.106297173071948, + "learning_rate": 9.202201264778649e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.453125, + "logps/chosen": -919.0, + "logps/rejected": -1637.0, + "loss": 0.5228, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.8291015625, + "rewards/margins": 8.3828125, + "rewards/rejected": -6.55859375, + "step": 1378 + }, + { + "epoch": 0.2736246837640756, + "grad_norm": 33.091254450485835, + "learning_rate": 9.200428683000494e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.3203125, + "logps/chosen": -954.0, + "logps/rejected": -956.0, + "loss": 0.3512, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.283203125, + "rewards/margins": 8.15625, + "rewards/rejected": -5.875, + "step": 1379 + }, + { + "epoch": 0.2738231063048762, + "grad_norm": 44.872261953423376, + "learning_rate": 9.198654326192431e-07, + "logits/chosen": 3.73828125, + "logits/rejected": 3.68359375, + "logps/chosen": -803.0, + "logps/rejected": -701.0, + "loss": 0.4257, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.333984375, + "rewards/margins": 6.890625, + "rewards/rejected": -5.5625, + "step": 1380 + }, + { + "epoch": 0.2740215288456769, + "grad_norm": 37.033662681499415, + "learning_rate": 9.196878195205588e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.15625, + "logps/chosen": -969.5, + "logps/rejected": -542.75, + "loss": 0.4162, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.244140625, + "rewards/margins": 5.53125, + "rewards/rejected": -4.28125, + "step": 1381 + }, + { + "epoch": 0.2742199513864775, + "grad_norm": 32.21343192184716, + "learning_rate": 9.195100290891943e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.3359375, + "logps/chosen": -1325.0, + "logps/rejected": -807.5, + "loss": 0.3728, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.767578125, + "rewards/margins": 7.7890625, + 
"rewards/rejected": -5.01953125, + "step": 1382 + }, + { + "epoch": 0.27441837392727814, + "grad_norm": 36.04800066197182, + "learning_rate": 9.193320614104325e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.98828125, + "logps/chosen": -921.0, + "logps/rejected": -630.5, + "loss": 0.5055, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.31640625, + "rewards/margins": 4.8046875, + "rewards/rejected": -2.4949951171875, + "step": 1383 + }, + { + "epoch": 0.2746167964680788, + "grad_norm": 32.192064966435304, + "learning_rate": 9.191539165696414e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 4.00390625, + "logps/chosen": -995.0, + "logps/rejected": -626.0, + "loss": 0.3644, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.654296875, + "rewards/margins": 6.3046875, + "rewards/rejected": -3.654296875, + "step": 1384 + }, + { + "epoch": 0.2748152190088794, + "grad_norm": 38.9244225048917, + "learning_rate": 9.18975594652274e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.8203125, + "logps/chosen": -894.0, + "logps/rejected": -483.0, + "loss": 0.5714, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.8740234375, + "rewards/margins": 3.16796875, + "rewards/rejected": -2.30078125, + "step": 1385 + }, + { + "epoch": 0.27501364154968005, + "grad_norm": 44.929975873054204, + "learning_rate": 9.18797095743868e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 4.15234375, + "logps/chosen": -969.0, + "logps/rejected": -843.0, + "loss": 0.4923, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.6015625, + "rewards/margins": 6.11328125, + "rewards/rejected": -4.50390625, + "step": 1386 + }, + { + "epoch": 0.27521206409048066, + "grad_norm": 35.953088015168035, + "learning_rate": 9.186184199300463e-07, + "logits/chosen": 3.6015625, + "logits/rejected": 3.7890625, + "logps/chosen": -702.0, + "logps/rejected": -930.5, + "loss": 0.5826, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.10009765625, + "rewards/margins": 
4.5703125, + "rewards/rejected": -3.48046875, + "step": 1387 + }, + { + "epoch": 0.2754104866312813, + "grad_norm": 45.950192136630264, + "learning_rate": 9.184395672965164e-07, + "logits/chosen": 3.54296875, + "logits/rejected": 3.75, + "logps/chosen": -1211.0, + "logps/rejected": -782.0, + "loss": 0.5261, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.0576171875, + "rewards/margins": 4.7109375, + "rewards/rejected": -2.6484375, + "step": 1388 + }, + { + "epoch": 0.275608909172082, + "grad_norm": 31.730131756761146, + "learning_rate": 9.182605379290708e-07, + "logits/chosen": 4.390625, + "logits/rejected": 4.3671875, + "logps/chosen": -1110.0, + "logps/rejected": -903.0, + "loss": 0.4017, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.76953125, + "rewards/margins": 7.3984375, + "rewards/rejected": -4.625, + "step": 1389 + }, + { + "epoch": 0.2758073317128826, + "grad_norm": 30.804099750964635, + "learning_rate": 9.180813319135865e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 3.95703125, + "logps/chosen": -927.0, + "logps/rejected": -556.5, + "loss": 0.4946, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.044921875, + "rewards/margins": 5.23046875, + "rewards/rejected": -3.193359375, + "step": 1390 + }, + { + "epoch": 0.27600575425368323, + "grad_norm": 34.07914662305439, + "learning_rate": 9.179019493360257e-07, + "logits/chosen": 3.7734375, + "logits/rejected": 4.01953125, + "logps/chosen": -1356.0, + "logps/rejected": -1262.0, + "loss": 0.3464, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.20703125, + "rewards/margins": 6.5859375, + "rewards/rejected": -4.37109375, + "step": 1391 + }, + { + "epoch": 0.27620417679448384, + "grad_norm": 44.464630912840576, + "learning_rate": 9.177223902824349e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.1171875, + "logps/chosen": -866.0, + "logps/rejected": -728.0, + "loss": 0.526, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5966796875, + "rewards/margins": 5.234375, 
+ "rewards/rejected": -3.642578125, + "step": 1392 + }, + { + "epoch": 0.2764025993352845, + "grad_norm": 38.69115843645724, + "learning_rate": 9.175426548389454e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.1328125, + "logps/chosen": -1653.0, + "logps/rejected": -1105.0, + "loss": 0.3748, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4375, + "rewards/margins": 10.5703125, + "rewards/rejected": -8.140625, + "step": 1393 + }, + { + "epoch": 0.2766010218760851, + "grad_norm": 28.55127662849468, + "learning_rate": 9.173627430917733e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 3.6484375, + "logps/chosen": -1023.0, + "logps/rejected": -639.5, + "loss": 0.3673, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.0830078125, + "rewards/margins": 6.390625, + "rewards/rejected": -4.2890625, + "step": 1394 + }, + { + "epoch": 0.27679944441688575, + "grad_norm": 39.72630598328387, + "learning_rate": 9.171826551272188e-07, + "logits/chosen": 4.36328125, + "logits/rejected": 4.4140625, + "logps/chosen": -942.0, + "logps/rejected": -828.0, + "loss": 0.4095, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.083984375, + "rewards/margins": 6.515625, + "rewards/rejected": -4.4375, + "step": 1395 + }, + { + "epoch": 0.2769978669576864, + "grad_norm": 39.99363893765273, + "learning_rate": 9.17002391031667e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.234375, + "logps/chosen": -846.5, + "logps/rejected": -757.0, + "loss": 0.435, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.6728515625, + "rewards/margins": 6.189453125, + "rewards/rejected": -4.50390625, + "step": 1396 + }, + { + "epoch": 0.277196289498487, + "grad_norm": 31.66277351933195, + "learning_rate": 9.168219508915875e-07, + "logits/chosen": 3.5234375, + "logits/rejected": 3.45703125, + "logps/chosen": -1514.0, + "logps/rejected": -988.0, + "loss": 0.4269, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.51953125, + "rewards/margins": 6.9921875, + 
"rewards/rejected": -5.48046875, + "step": 1397 + }, + { + "epoch": 0.27739471203928767, + "grad_norm": 40.78088748182081, + "learning_rate": 9.166413347935339e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.78515625, + "logps/chosen": -953.0, + "logps/rejected": -740.0, + "loss": 0.4015, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.89453125, + "rewards/margins": 6.34765625, + "rewards/rejected": -3.4453125, + "step": 1398 + }, + { + "epoch": 0.2775931345800883, + "grad_norm": 37.25006188215525, + "learning_rate": 9.16460542824145e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.08984375, + "logps/chosen": -1179.0, + "logps/rejected": -963.0, + "loss": 0.4351, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.892578125, + "rewards/margins": 6.7265625, + "rewards/rejected": -4.83203125, + "step": 1399 + }, + { + "epoch": 0.27779155712088893, + "grad_norm": 40.67675275826809, + "learning_rate": 9.162795750701432e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.06640625, + "logps/chosen": -1528.0, + "logps/rejected": -805.0, + "loss": 0.4013, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.802734375, + "rewards/margins": 7.7109375, + "rewards/rejected": -4.90625, + "step": 1400 + }, + { + "epoch": 0.2779899796616896, + "grad_norm": 32.8562157210911, + "learning_rate": 9.160984316183354e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.94921875, + "logps/chosen": -1102.0, + "logps/rejected": -610.5, + "loss": 0.3456, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.142578125, + "rewards/margins": 6.8125, + "rewards/rejected": -4.66015625, + "step": 1401 + }, + { + "epoch": 0.2781884022024902, + "grad_norm": 30.598288497306097, + "learning_rate": 9.159171125556131e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.96484375, + "logps/chosen": -696.0, + "logps/rejected": -630.0, + "loss": 0.4501, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.826171875, + "rewards/margins": 16.84765625, + 
"rewards/rejected": -14.96875, + "step": 1402 + }, + { + "epoch": 0.27838682474329085, + "grad_norm": 43.33333833666016, + "learning_rate": 9.157356179689519e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.98046875, + "logps/chosen": -1144.0, + "logps/rejected": -894.0, + "loss": 0.4966, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.697265625, + "rewards/margins": 5.8984375, + "rewards/rejected": -4.203125, + "step": 1403 + }, + { + "epoch": 0.27858524728409145, + "grad_norm": 29.00648742715665, + "learning_rate": 9.155539479454113e-07, + "logits/chosen": 4.625, + "logits/rejected": 4.6328125, + "logps/chosen": -720.0, + "logps/rejected": -1537.0, + "loss": 0.4583, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.86328125, + "rewards/margins": 8.51953125, + "rewards/rejected": -6.6591796875, + "step": 1404 + }, + { + "epoch": 0.2787836698248921, + "grad_norm": 33.04457947924345, + "learning_rate": 9.153721025721355e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.88671875, + "logps/chosen": -807.5, + "logps/rejected": -838.0, + "loss": 0.4136, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.26953125, + "rewards/margins": 6.78125, + "rewards/rejected": -4.5078125, + "step": 1405 + }, + { + "epoch": 0.27898209236569277, + "grad_norm": 36.43352565305234, + "learning_rate": 9.151900819363521e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.53125, + "logps/chosen": -1632.0, + "logps/rejected": -977.0, + "loss": 0.4381, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.66796875, + "rewards/margins": 7.4375, + "rewards/rejected": -4.765625, + "step": 1406 + }, + { + "epoch": 0.27918051490649337, + "grad_norm": 42.215607602046795, + "learning_rate": 9.150078861253734e-07, + "logits/chosen": 4.015625, + "logits/rejected": 4.1015625, + "logps/chosen": -870.0, + "logps/rejected": -752.0, + "loss": 0.6009, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.0048828125, + "rewards/margins": 3.978515625, + "rewards/rejected": 
-2.9716796875, + "step": 1407 + }, + { + "epoch": 0.27937893744729403, + "grad_norm": 34.51942844226426, + "learning_rate": 9.148255152265954e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 3.70703125, + "logps/chosen": -874.0, + "logps/rejected": -656.5, + "loss": 0.4479, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.259765625, + "rewards/margins": 5.89453125, + "rewards/rejected": -3.63671875, + "step": 1408 + }, + { + "epoch": 0.27957735998809463, + "grad_norm": 38.88843084818848, + "learning_rate": 9.146429693274984e-07, + "logits/chosen": 4.05078125, + "logits/rejected": 4.10546875, + "logps/chosen": -873.0, + "logps/rejected": -906.0, + "loss": 0.5423, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.5194091796875, + "rewards/margins": 5.705078125, + "rewards/rejected": -4.183837890625, + "step": 1409 + }, + { + "epoch": 0.2797757825288953, + "grad_norm": 34.253483732806046, + "learning_rate": 9.144602485156462e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.4375, + "logps/chosen": -1033.0, + "logps/rejected": -826.0, + "loss": 0.3335, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.126953125, + "rewards/margins": 7.5703125, + "rewards/rejected": -5.4609375, + "step": 1410 + }, + { + "epoch": 0.2799742050696959, + "grad_norm": 33.931509133696345, + "learning_rate": 9.142773528786868e-07, + "logits/chosen": 3.80859375, + "logits/rejected": 4.26953125, + "logps/chosen": -708.5, + "logps/rejected": -859.0, + "loss": 0.5378, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.48046875, + "rewards/margins": 5.1484375, + "rewards/rejected": -3.671875, + "step": 1411 + }, + { + "epoch": 0.28017262761049655, + "grad_norm": 38.12307913236109, + "learning_rate": 9.140942825043521e-07, + "logits/chosen": 4.6328125, + "logits/rejected": 4.34375, + "logps/chosen": -1345.0, + "logps/rejected": -944.0, + "loss": 0.4181, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.244140625, + "rewards/margins": 6.46875, + "rewards/rejected": 
-4.234375, + "step": 1412 + }, + { + "epoch": 0.2803710501512972, + "grad_norm": 30.12774398735557, + "learning_rate": 9.139110374804575e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.31640625, + "logps/chosen": -1121.0, + "logps/rejected": -796.0, + "loss": 0.3356, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.66015625, + "rewards/margins": 7.65625, + "rewards/rejected": -4.984375, + "step": 1413 + }, + { + "epoch": 0.2805694726920978, + "grad_norm": 28.01686509860935, + "learning_rate": 9.137276178949024e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 4.21484375, + "logps/chosen": -1370.0, + "logps/rejected": -1108.5, + "loss": 0.3803, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.21484375, + "rewards/margins": 7.546875, + "rewards/rejected": -4.3125, + "step": 1414 + }, + { + "epoch": 0.28076789523289847, + "grad_norm": 40.782458805376486, + "learning_rate": 9.135440238356699e-07, + "logits/chosen": 3.6640625, + "logits/rejected": 3.62890625, + "logps/chosen": -1054.0, + "logps/rejected": -790.0, + "loss": 0.3758, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.146484375, + "rewards/margins": 6.3984375, + "rewards/rejected": -4.24609375, + "step": 1415 + }, + { + "epoch": 0.28096631777369907, + "grad_norm": 26.4210114836652, + "learning_rate": 9.133602553908271e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 3.83203125, + "logps/chosen": -1584.0, + "logps/rejected": -887.0, + "loss": 0.3523, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.37060546875, + "rewards/margins": 7.66015625, + "rewards/rejected": -5.2890625, + "step": 1416 + }, + { + "epoch": 0.28116474031449973, + "grad_norm": 35.705205097462, + "learning_rate": 9.131763126485241e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.09375, + "logps/chosen": -1072.0, + "logps/rejected": -1512.0, + "loss": 0.4467, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.04296875, + "rewards/margins": 8.75, + "rewards/rejected": -6.6953125, + 
"step": 1417 + }, + { + "epoch": 0.2813631628553004, + "grad_norm": 34.818799509976714, + "learning_rate": 9.129921956969953e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.34375, + "logps/chosen": -930.0, + "logps/rejected": -628.5, + "loss": 0.4996, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.11083984375, + "rewards/margins": 5.3515625, + "rewards/rejected": -4.2421875, + "step": 1418 + }, + { + "epoch": 0.281561585396101, + "grad_norm": 30.737321775971004, + "learning_rate": 9.12807904624558e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 3.90234375, + "logps/chosen": -744.0, + "logps/rejected": -752.5, + "loss": 0.4739, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.728515625, + "rewards/margins": 6.390625, + "rewards/rejected": -4.65625, + "step": 1419 + }, + { + "epoch": 0.28176000793690165, + "grad_norm": 39.9063696406899, + "learning_rate": 9.126234395196136e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.19921875, + "logps/chosen": -795.0, + "logps/rejected": -646.5, + "loss": 0.3971, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.8564453125, + "rewards/margins": 7.34375, + "rewards/rejected": -5.48828125, + "step": 1420 + }, + { + "epoch": 0.28195843047770225, + "grad_norm": 31.036011178432645, + "learning_rate": 9.124388004706466e-07, + "logits/chosen": 4.41015625, + "logits/rejected": 4.4140625, + "logps/chosen": -687.0, + "logps/rejected": -1749.5, + "loss": 0.5687, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0244140625, + "rewards/margins": 10.125, + "rewards/rejected": -9.109375, + "step": 1421 + }, + { + "epoch": 0.2821568530185029, + "grad_norm": 37.38583141126724, + "learning_rate": 9.122539875662253e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 3.90234375, + "logps/chosen": -750.0, + "logps/rejected": -748.5, + "loss": 0.5461, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.029541015625, + "rewards/margins": 5.546875, + "rewards/rejected": -4.51171875, + "step": 1422 + }, 
+ { + "epoch": 0.28235527555930356, + "grad_norm": 32.36049670911889, + "learning_rate": 9.120690008950008e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.3046875, + "logps/chosen": -1208.0, + "logps/rejected": -1042.0, + "loss": 0.4812, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.89453125, + "rewards/margins": 7.4296875, + "rewards/rejected": -4.53515625, + "step": 1423 + }, + { + "epoch": 0.28255369810010417, + "grad_norm": 34.61313870490656, + "learning_rate": 9.118838405457081e-07, + "logits/chosen": 3.953125, + "logits/rejected": 4.09765625, + "logps/chosen": -1107.0, + "logps/rejected": -740.0, + "loss": 0.5481, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.328125, + "rewards/margins": 5.28515625, + "rewards/rejected": -3.9609375, + "step": 1424 + }, + { + "epoch": 0.2827521206409048, + "grad_norm": 37.380443060095146, + "learning_rate": 9.116985066071652e-07, + "logits/chosen": 4.4765625, + "logits/rejected": 4.6484375, + "logps/chosen": -1094.0, + "logps/rejected": -692.0, + "loss": 0.4506, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.546875, + "rewards/margins": 5.8515625, + "rewards/rejected": -4.3046875, + "step": 1425 + }, + { + "epoch": 0.2829505431817054, + "grad_norm": 34.09408354575063, + "learning_rate": 9.115129991682738e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.40625, + "logps/chosen": -1099.0, + "logps/rejected": -894.0, + "loss": 0.3263, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.51953125, + "rewards/margins": 7.5625, + "rewards/rejected": -5.046875, + "step": 1426 + }, + { + "epoch": 0.2831489657225061, + "grad_norm": 31.798391752654116, + "learning_rate": 9.113273183180184e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.17578125, + "logps/chosen": -954.0, + "logps/rejected": -881.0, + "loss": 0.3663, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.34375, + "rewards/margins": 18.1640625, + "rewards/rejected": -15.7578125, + "step": 1427 + }, + { + "epoch": 
0.2833473882633067, + "grad_norm": 36.315344305866624, + "learning_rate": 9.111414641454667e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.1796875, + "logps/chosen": -770.5, + "logps/rejected": -589.0, + "loss": 0.592, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.4609375, + "rewards/margins": 3.99609375, + "rewards/rejected": -2.529296875, + "step": 1428 + }, + { + "epoch": 0.28354581080410735, + "grad_norm": 40.19679497881237, + "learning_rate": 9.109554367397697e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 3.6640625, + "logps/chosen": -1006.0, + "logps/rejected": -632.0, + "loss": 0.4748, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.51904296875, + "rewards/margins": 5.03125, + "rewards/rejected": -3.51171875, + "step": 1429 + }, + { + "epoch": 0.283744233344908, + "grad_norm": 32.84761522331136, + "learning_rate": 9.107692361901614e-07, + "logits/chosen": 4.625, + "logits/rejected": 4.5234375, + "logps/chosen": -692.5, + "logps/rejected": -622.5, + "loss": 0.3822, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.8125, + "rewards/margins": 6.4765625, + "rewards/rejected": -4.671875, + "step": 1430 + }, + { + "epoch": 0.2839426558857086, + "grad_norm": 28.313109882313288, + "learning_rate": 9.10582862585959e-07, + "logits/chosen": 3.75, + "logits/rejected": 3.640625, + "logps/chosen": -809.5, + "logps/rejected": -600.0, + "loss": 0.553, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.76171875, + "rewards/margins": 4.87890625, + "rewards/rejected": -3.12109375, + "step": 1431 + }, + { + "epoch": 0.28414107842650926, + "grad_norm": 43.4042687485426, + "learning_rate": 9.103963160165626e-07, + "logits/chosen": 3.734375, + "logits/rejected": 4.234375, + "logps/chosen": -989.5, + "logps/rejected": -853.0, + "loss": 0.5946, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.4072265625, + "rewards/margins": 5.072265625, + "rewards/rejected": -3.67578125, + "step": 1432 + }, + { + "epoch": 0.28433950096730987, + 
"grad_norm": 41.255065421616266, + "learning_rate": 9.102095965714551e-07, + "logits/chosen": 4.046875, + "logits/rejected": 4.1484375, + "logps/chosen": -1075.0, + "logps/rejected": -1510.0, + "loss": 0.4561, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.986328125, + "rewards/margins": 9.171875, + "rewards/rejected": -7.1640625, + "step": 1433 + }, + { + "epoch": 0.2845379235081105, + "grad_norm": 33.38276980558344, + "learning_rate": 9.100227043402026e-07, + "logits/chosen": 4.6484375, + "logits/rejected": 4.6015625, + "logps/chosen": -825.0, + "logps/rejected": -706.0, + "loss": 0.4363, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.044921875, + "rewards/margins": 6.046875, + "rewards/rejected": -3.98828125, + "step": 1434 + }, + { + "epoch": 0.2847363460489112, + "grad_norm": 30.66269152904942, + "learning_rate": 9.098356394124542e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.3984375, + "logps/chosen": -840.0, + "logps/rejected": -816.0, + "loss": 0.4807, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.5302734375, + "rewards/margins": 5.40625, + "rewards/rejected": -3.8671875, + "step": 1435 + }, + { + "epoch": 0.2849347685897118, + "grad_norm": 31.25675601937395, + "learning_rate": 9.096484018779414e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.4765625, + "logps/chosen": -929.0, + "logps/rejected": -598.5, + "loss": 0.5838, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.4423828125, + "rewards/margins": 4.65625, + "rewards/rejected": -3.22265625, + "step": 1436 + }, + { + "epoch": 0.28513319113051244, + "grad_norm": 30.90643828821777, + "learning_rate": 9.094609918264786e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.20703125, + "logps/chosen": -974.0, + "logps/rejected": -772.0, + "loss": 0.3842, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.126953125, + "rewards/margins": 6.546875, + "rewards/rejected": -4.41015625, + "step": 1437 + }, + { + "epoch": 0.28533161367131304, + "grad_norm": 
35.61681470010279, + "learning_rate": 9.092734093479633e-07, + "logits/chosen": 4.41796875, + "logits/rejected": 4.5, + "logps/chosen": -973.0, + "logps/rejected": -560.5, + "loss": 0.4684, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.0224609375, + "rewards/margins": 4.8125, + "rewards/rejected": -2.7890625, + "step": 1438 + }, + { + "epoch": 0.2855300362121137, + "grad_norm": 40.24771380292898, + "learning_rate": 9.090856545323754e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 4.16796875, + "logps/chosen": -1491.0, + "logps/rejected": -871.5, + "loss": 0.2729, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.2578125, + "rewards/margins": 7.375, + "rewards/rejected": -5.1171875, + "step": 1439 + }, + { + "epoch": 0.2857284587529143, + "grad_norm": 37.215355051506776, + "learning_rate": 9.088977274697775e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 3.796875, + "logps/chosen": -931.0, + "logps/rejected": -547.0, + "loss": 0.3646, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.099609375, + "rewards/margins": 5.8828125, + "rewards/rejected": -3.78125, + "step": 1440 + }, + { + "epoch": 0.28592688129371496, + "grad_norm": 42.53502563409483, + "learning_rate": 9.08709628250315e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 3.86328125, + "logps/chosen": -885.0, + "logps/rejected": -1236.5, + "loss": 0.444, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.193359375, + "rewards/margins": 6.3125, + "rewards/rejected": -4.11328125, + "step": 1441 + }, + { + "epoch": 0.2861253038345156, + "grad_norm": 37.57405872465807, + "learning_rate": 9.085213569642158e-07, + "logits/chosen": 3.875, + "logits/rejected": 4.046875, + "logps/chosen": -1001.0, + "logps/rejected": -808.0, + "loss": 0.4509, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.556640625, + "rewards/margins": 5.1015625, + "rewards/rejected": -2.54296875, + "step": 1442 + }, + { + "epoch": 0.2863237263753162, + "grad_norm": 41.94013403123614, + 
"learning_rate": 9.0833291370179e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.12109375, + "logps/chosen": -931.0, + "logps/rejected": -672.5, + "loss": 0.4822, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.001953125, + "rewards/margins": 5.8203125, + "rewards/rejected": -3.82421875, + "step": 1443 + }, + { + "epoch": 0.2865221489161169, + "grad_norm": 36.41030850085537, + "learning_rate": 9.081442985534308e-07, + "logits/chosen": 4.33203125, + "logits/rejected": 4.48046875, + "logps/chosen": -729.0, + "logps/rejected": -842.0, + "loss": 0.4753, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.140625, + "rewards/margins": 5.0859375, + "rewards/rejected": -2.947265625, + "step": 1444 + }, + { + "epoch": 0.2867205714569175, + "grad_norm": 38.96075366595454, + "learning_rate": 9.079555116096133e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.88671875, + "logps/chosen": -869.0, + "logps/rejected": -682.0, + "loss": 0.5365, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.6474609375, + "rewards/margins": 13.4609375, + "rewards/rejected": -11.8203125, + "step": 1445 + }, + { + "epoch": 0.28691899399771814, + "grad_norm": 35.431659067124464, + "learning_rate": 9.077665529608955e-07, + "logits/chosen": 3.640625, + "logits/rejected": 3.74609375, + "logps/chosen": -972.5, + "logps/rejected": -720.0, + "loss": 0.5252, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9755859375, + "rewards/margins": 4.41796875, + "rewards/rejected": -3.435546875, + "step": 1446 + }, + { + "epoch": 0.2871174165385188, + "grad_norm": 40.75117678197697, + "learning_rate": 9.075774226979171e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 4.25390625, + "logps/chosen": -713.0, + "logps/rejected": -1176.5, + "loss": 0.5489, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.791015625, + "rewards/margins": 6.1796875, + "rewards/rejected": -4.3828125, + "step": 1447 + }, + { + "epoch": 0.2873158390793194, + "grad_norm": 33.004060848194236, + 
"learning_rate": 9.073881209114009e-07, + "logits/chosen": 3.9375, + "logits/rejected": 4.0546875, + "logps/chosen": -1178.0, + "logps/rejected": -993.0, + "loss": 0.4494, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.423828125, + "rewards/margins": 9.578125, + "rewards/rejected": -7.1484375, + "step": 1448 + }, + { + "epoch": 0.28751426162012006, + "grad_norm": 35.817020699924484, + "learning_rate": 9.071986476921514e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.2265625, + "logps/chosen": -866.0, + "logps/rejected": -662.0, + "loss": 0.4063, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.990234375, + "rewards/margins": 7.140625, + "rewards/rejected": -5.15234375, + "step": 1449 + }, + { + "epoch": 0.28771268416092066, + "grad_norm": 43.914746600805906, + "learning_rate": 9.070090031310558e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.421875, + "logps/chosen": -1268.0, + "logps/rejected": -1230.0, + "loss": 0.5576, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.57421875, + "rewards/margins": 6.125, + "rewards/rejected": -4.55078125, + "step": 1450 + }, + { + "epoch": 0.2879111067017213, + "grad_norm": 40.904468937576205, + "learning_rate": 9.068191873190829e-07, + "logits/chosen": 3.78515625, + "logits/rejected": 3.68359375, + "logps/chosen": -1335.0, + "logps/rejected": -742.0, + "loss": 0.3944, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.673828125, + "rewards/margins": 7.54296875, + "rewards/rejected": -5.8671875, + "step": 1451 + }, + { + "epoch": 0.288109529242522, + "grad_norm": 52.55722902694298, + "learning_rate": 9.066292003472842e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.25, + "logps/chosen": -1523.0, + "logps/rejected": -634.0, + "loss": 0.4743, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.06640625, + "rewards/margins": 7.125, + "rewards/rejected": -4.05859375, + "step": 1452 + }, + { + "epoch": 0.2883079517833226, + "grad_norm": 34.94494816247566, + "learning_rate": 
9.06439042306793e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 3.93359375, + "logps/chosen": -1145.0, + "logps/rejected": -605.5, + "loss": 0.347, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.89453125, + "rewards/margins": 6.3515625, + "rewards/rejected": -3.45703125, + "step": 1453 + }, + { + "epoch": 0.28850637432412324, + "grad_norm": 35.62599668666449, + "learning_rate": 9.062487132888247e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.0859375, + "logps/chosen": -990.0, + "logps/rejected": -636.5, + "loss": 0.4331, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.63671875, + "rewards/margins": 5.484375, + "rewards/rejected": -2.8359375, + "step": 1454 + }, + { + "epoch": 0.28870479686492384, + "grad_norm": 38.616726425034464, + "learning_rate": 9.060582133846768e-07, + "logits/chosen": 4.5234375, + "logits/rejected": 4.421875, + "logps/chosen": -898.0, + "logps/rejected": -582.0, + "loss": 0.4341, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.509765625, + "rewards/margins": 5.5859375, + "rewards/rejected": -4.078125, + "step": 1455 + }, + { + "epoch": 0.2889032194057245, + "grad_norm": 41.866181577616736, + "learning_rate": 9.058675426857286e-07, + "logits/chosen": 3.77734375, + "logits/rejected": 3.765625, + "logps/chosen": -1235.0, + "logps/rejected": -776.0, + "loss": 0.3203, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.525390625, + "rewards/margins": 6.7265625, + "rewards/rejected": -4.20703125, + "step": 1456 + }, + { + "epoch": 0.2891016419465251, + "grad_norm": 33.90832972065908, + "learning_rate": 9.056767012834416e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.609375, + "logps/chosen": -845.5, + "logps/rejected": -826.0, + "loss": 0.5345, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.01171875, + "rewards/margins": 6.5546875, + "rewards/rejected": -3.54296875, + "step": 1457 + }, + { + "epoch": 0.28930006448732576, + "grad_norm": 39.18187717434886, + "learning_rate": 
9.05485689269359e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 3.83203125, + "logps/chosen": -1042.0, + "logps/rejected": -631.0, + "loss": 0.5822, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.541015625, + "rewards/margins": 5.2265625, + "rewards/rejected": -3.68359375, + "step": 1458 + }, + { + "epoch": 0.2894984870281264, + "grad_norm": 34.63873605157501, + "learning_rate": 9.052945067351059e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 3.93359375, + "logps/chosen": -967.0, + "logps/rejected": -650.5, + "loss": 0.5259, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.85546875, + "rewards/margins": 4.9296875, + "rewards/rejected": -3.0673828125, + "step": 1459 + }, + { + "epoch": 0.289696909568927, + "grad_norm": 34.40726214265432, + "learning_rate": 9.05103153772389e-07, + "logits/chosen": 3.890625, + "logits/rejected": 4.01171875, + "logps/chosen": -875.0, + "logps/rejected": -753.5, + "loss": 0.4307, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.7294921875, + "rewards/margins": 5.6953125, + "rewards/rejected": -3.9609375, + "step": 1460 + }, + { + "epoch": 0.2898953321097277, + "grad_norm": 36.575734924136306, + "learning_rate": 9.049116304729971e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.28125, + "logps/chosen": -1024.0, + "logps/rejected": -963.0, + "loss": 0.4388, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.029296875, + "rewards/margins": 6.15625, + "rewards/rejected": -4.12890625, + "step": 1461 + }, + { + "epoch": 0.2900937546505283, + "grad_norm": 33.198493813309966, + "learning_rate": 9.047199369288004e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.1953125, + "logps/chosen": -837.5, + "logps/rejected": -739.0, + "loss": 0.4632, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.1171875, + "rewards/margins": 5.48046875, + "rewards/rejected": -3.37109375, + "step": 1462 + }, + { + "epoch": 0.29029217719132894, + "grad_norm": 35.67236734997289, + "learning_rate": 
9.04528073231751e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.08203125, + "logps/chosen": -1056.0, + "logps/rejected": -836.0, + "loss": 0.3845, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.87890625, + "rewards/margins": 7.796875, + "rewards/rejected": -5.8984375, + "step": 1463 + }, + { + "epoch": 0.2904905997321296, + "grad_norm": 34.623646644981946, + "learning_rate": 9.043360394738825e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.09375, + "logps/chosen": -831.5, + "logps/rejected": -1467.5, + "loss": 0.3571, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.1240234375, + "rewards/margins": 9.1328125, + "rewards/rejected": -7.0078125, + "step": 1464 + }, + { + "epoch": 0.2906890222729302, + "grad_norm": 36.1416661283079, + "learning_rate": 9.0414383574731e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 3.8359375, + "logps/chosen": -909.0, + "logps/rejected": -1016.5, + "loss": 0.407, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.541015625, + "rewards/margins": 6.36328125, + "rewards/rejected": -4.8203125, + "step": 1465 + }, + { + "epoch": 0.29088744481373086, + "grad_norm": 39.4240378838169, + "learning_rate": 9.039514621442303e-07, + "logits/chosen": 4.0, + "logits/rejected": 3.9140625, + "logps/chosen": -962.5, + "logps/rejected": -740.5, + "loss": 0.4334, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.1240234375, + "rewards/margins": 6.265625, + "rewards/rejected": -4.13671875, + "step": 1466 + }, + { + "epoch": 0.29108586735453146, + "grad_norm": 44.609865714337914, + "learning_rate": 9.037589187569213e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.609375, + "logps/chosen": -609.5, + "logps/rejected": -645.0, + "loss": 0.6364, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.821044921875, + "rewards/margins": 4.69140625, + "rewards/rejected": -3.875, + "step": 1467 + }, + { + "epoch": 0.2912842898953321, + "grad_norm": 34.57540365540696, + "learning_rate": 9.035662056777431e-07, + 
"logits/chosen": 3.93359375, + "logits/rejected": 3.83984375, + "logps/chosen": -705.0, + "logps/rejected": -427.5, + "loss": 0.4773, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.921630859375, + "rewards/margins": 4.45703125, + "rewards/rejected": -3.537109375, + "step": 1468 + }, + { + "epoch": 0.2914827124361327, + "grad_norm": 31.034352204496347, + "learning_rate": 9.033733229991365e-07, + "logits/chosen": 3.765625, + "logits/rejected": 3.91015625, + "logps/chosen": -888.5, + "logps/rejected": -1056.0, + "loss": 0.5147, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.54296875, + "rewards/margins": 5.796875, + "rewards/rejected": -4.25, + "step": 1469 + }, + { + "epoch": 0.2916811349769334, + "grad_norm": 44.36012206370168, + "learning_rate": 9.03180270813624e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.36328125, + "logps/chosen": -976.5, + "logps/rejected": -793.0, + "loss": 0.4827, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.8662109375, + "rewards/margins": 11.03515625, + "rewards/rejected": -9.185546875, + "step": 1470 + }, + { + "epoch": 0.29187955751773403, + "grad_norm": 30.433988974484365, + "learning_rate": 9.02987049213809e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.2890625, + "logps/chosen": -639.0, + "logps/rejected": -575.5, + "loss": 0.3865, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.224609375, + "rewards/margins": 6.1875, + "rewards/rejected": -3.966796875, + "step": 1471 + }, + { + "epoch": 0.29207798005853464, + "grad_norm": 39.330565812780854, + "learning_rate": 9.027936582923769e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.41796875, + "logps/chosen": -1333.0, + "logps/rejected": -1751.0, + "loss": 0.3751, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.64453125, + "rewards/margins": 9.9609375, + "rewards/rejected": -7.328125, + "step": 1472 + }, + { + "epoch": 0.2922764025993353, + "grad_norm": 36.111361882005106, + "learning_rate": 9.026000981420936e-07, + 
"logits/chosen": 4.453125, + "logits/rejected": 4.15234375, + "logps/chosen": -724.0, + "logps/rejected": -785.5, + "loss": 0.4285, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.869140625, + "rewards/margins": 5.9921875, + "rewards/rejected": -4.1171875, + "step": 1473 + }, + { + "epoch": 0.2924748251401359, + "grad_norm": 33.75123396673616, + "learning_rate": 9.024063688558066e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.6953125, + "logps/chosen": -1068.0, + "logps/rejected": -889.5, + "loss": 0.3379, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.1171875, + "rewards/margins": 6.8515625, + "rewards/rejected": -4.734375, + "step": 1474 + }, + { + "epoch": 0.29267324768093655, + "grad_norm": 53.25494880505754, + "learning_rate": 9.022124705264443e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 3.85546875, + "logps/chosen": -1240.0, + "logps/rejected": -794.0, + "loss": 0.4225, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.66015625, + "rewards/margins": 6.9609375, + "rewards/rejected": -4.30078125, + "step": 1475 + }, + { + "epoch": 0.2928716702217372, + "grad_norm": 35.125486820713505, + "learning_rate": 9.020184032470163e-07, + "logits/chosen": 3.796875, + "logits/rejected": 3.96875, + "logps/chosen": -725.0, + "logps/rejected": -631.0, + "loss": 0.3508, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.3427734375, + "rewards/margins": 6.61328125, + "rewards/rejected": -4.2734375, + "step": 1476 + }, + { + "epoch": 0.2930700927625378, + "grad_norm": 27.87572269197596, + "learning_rate": 9.018241671106134e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.8125, + "logps/chosen": -1096.0, + "logps/rejected": -756.5, + "loss": 0.378, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.6572265625, + "rewards/margins": 5.8046875, + "rewards/rejected": -4.1484375, + "step": 1477 + }, + { + "epoch": 0.2932685153033385, + "grad_norm": 38.54470162201667, + "learning_rate": 9.016297622104069e-07, + "logits/chosen": 
4.37109375, + "logits/rejected": 4.3984375, + "logps/chosen": -1276.0, + "logps/rejected": -752.0, + "loss": 0.4478, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.90234375, + "rewards/margins": 6.515625, + "rewards/rejected": -4.6171875, + "step": 1478 + }, + { + "epoch": 0.2934669378441391, + "grad_norm": 31.652596445877226, + "learning_rate": 9.014351886396499e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 4.30078125, + "logps/chosen": -1132.0, + "logps/rejected": -700.0, + "loss": 0.3452, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.58203125, + "rewards/margins": 6.9453125, + "rewards/rejected": -4.37109375, + "step": 1479 + }, + { + "epoch": 0.29366536038493973, + "grad_norm": 37.322481393737135, + "learning_rate": 9.012404464916752e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 3.6640625, + "logps/chosen": -812.0, + "logps/rejected": -564.5, + "loss": 0.418, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6201171875, + "rewards/margins": 5.9140625, + "rewards/rejected": -4.2890625, + "step": 1480 + }, + { + "epoch": 0.2938637829257404, + "grad_norm": 37.573581982317194, + "learning_rate": 9.010455358598976e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.31640625, + "logps/chosen": -803.0, + "logps/rejected": -687.5, + "loss": 0.5552, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.3515625, + "rewards/margins": 5.421875, + "rewards/rejected": -4.0625, + "step": 1481 + }, + { + "epoch": 0.294062205466541, + "grad_norm": 33.084261194466066, + "learning_rate": 9.008504568378122e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 3.96875, + "logps/chosen": -1232.0, + "logps/rejected": -1488.0, + "loss": 0.2665, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4375, + "rewards/margins": 9.8359375, + "rewards/rejected": -7.375, + "step": 1482 + }, + { + "epoch": 0.29426062800734165, + "grad_norm": 44.79748954377129, + "learning_rate": 9.006552095189948e-07, + "logits/chosen": 4.03125, + 
"logits/rejected": 4.00390625, + "logps/chosen": -1151.0, + "logps/rejected": -831.0, + "loss": 0.4416, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.994140625, + "rewards/margins": 6.5, + "rewards/rejected": -4.51171875, + "step": 1483 + }, + { + "epoch": 0.29445905054814225, + "grad_norm": 35.757217418197925, + "learning_rate": 9.004597939971024e-07, + "logits/chosen": 3.671875, + "logits/rejected": 3.546875, + "logps/chosen": -1103.0, + "logps/rejected": -793.0, + "loss": 0.6507, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.822265625, + "rewards/margins": 5.46875, + "rewards/rejected": -3.64453125, + "step": 1484 + }, + { + "epoch": 0.2946574730889429, + "grad_norm": 32.84752156849811, + "learning_rate": 9.002642103658719e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 4.19921875, + "logps/chosen": -1013.0, + "logps/rejected": -742.0, + "loss": 0.4703, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.9287109375, + "rewards/margins": 6.98828125, + "rewards/rejected": -5.0546875, + "step": 1485 + }, + { + "epoch": 0.2948558956297435, + "grad_norm": 34.19376066584701, + "learning_rate": 9.000684587191217e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.8125, + "logps/chosen": -810.0, + "logps/rejected": -552.5, + "loss": 0.4855, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.0625, + "rewards/margins": 5.0, + "rewards/rejected": -2.9453125, + "step": 1486 + }, + { + "epoch": 0.29505431817054417, + "grad_norm": 36.11971412614244, + "learning_rate": 8.998725391507501e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 3.99609375, + "logps/chosen": -943.0, + "logps/rejected": -765.0, + "loss": 0.4532, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.654296875, + "rewards/margins": 6.3203125, + "rewards/rejected": -4.66015625, + "step": 1487 + }, + { + "epoch": 0.29525274071134483, + "grad_norm": 27.420480290672334, + "learning_rate": 8.996764517547364e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 
4.02734375, + "logps/chosen": -917.0, + "logps/rejected": -651.0, + "loss": 0.543, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0146484375, + "rewards/margins": 4.91015625, + "rewards/rejected": -3.8984375, + "step": 1488 + }, + { + "epoch": 0.29545116325214543, + "grad_norm": 34.36985955467364, + "learning_rate": 8.994801966251403e-07, + "logits/chosen": 3.6328125, + "logits/rejected": 3.73046875, + "logps/chosen": -927.0, + "logps/rejected": -1002.0, + "loss": 0.4547, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.423828125, + "rewards/margins": 7.90625, + "rewards/rejected": -6.5078125, + "step": 1489 + }, + { + "epoch": 0.2956495857929461, + "grad_norm": 33.62170691568171, + "learning_rate": 8.992837738561016e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 3.87109375, + "logps/chosen": -1135.0, + "logps/rejected": -733.0, + "loss": 0.3091, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1083984375, + "rewards/margins": 7.171875, + "rewards/rejected": -4.0625, + "step": 1490 + }, + { + "epoch": 0.2958480083337467, + "grad_norm": 42.84461031294581, + "learning_rate": 8.990871835418411e-07, + "logits/chosen": 3.921875, + "logits/rejected": 3.8515625, + "logps/chosen": -1098.0, + "logps/rejected": -742.0, + "loss": 0.3873, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.8359375, + "rewards/margins": 6.71875, + "rewards/rejected": -4.875, + "step": 1491 + }, + { + "epoch": 0.29604643087454735, + "grad_norm": 34.26279040241645, + "learning_rate": 8.988904257766595e-07, + "logits/chosen": 3.58203125, + "logits/rejected": 4.13671875, + "logps/chosen": -738.5, + "logps/rejected": -781.0, + "loss": 0.5714, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.59552001953125, + "rewards/margins": 5.6484375, + "rewards/rejected": -5.0625, + "step": 1492 + }, + { + "epoch": 0.296244853415348, + "grad_norm": 25.111545622626622, + "learning_rate": 8.986935006549382e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 3.546875, + 
"logps/chosen": -1057.0, + "logps/rejected": -688.5, + "loss": 0.3087, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.068359375, + "rewards/margins": 7.90625, + "rewards/rejected": -5.84375, + "step": 1493 + }, + { + "epoch": 0.2964432759561486, + "grad_norm": 41.34584482213453, + "learning_rate": 8.984964082711386e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 4.07421875, + "logps/chosen": -1194.0, + "logps/rejected": -995.0, + "loss": 0.4611, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.623046875, + "rewards/margins": 6.2265625, + "rewards/rejected": -4.59765625, + "step": 1494 + }, + { + "epoch": 0.29664169849694927, + "grad_norm": 26.934922274098295, + "learning_rate": 8.982991487198023e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 4.109375, + "logps/chosen": -792.5, + "logps/rejected": -1660.0, + "loss": 0.5184, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.943359375, + "rewards/margins": 12.296875, + "rewards/rejected": -11.328125, + "step": 1495 + }, + { + "epoch": 0.29684012103774987, + "grad_norm": 39.53617630653399, + "learning_rate": 8.981017220955515e-07, + "logits/chosen": 3.671875, + "logits/rejected": 4.01953125, + "logps/chosen": -846.0, + "logps/rejected": -932.0, + "loss": 0.4237, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.217529296875, + "rewards/margins": 11.1640625, + "rewards/rejected": -9.9453125, + "step": 1496 + }, + { + "epoch": 0.29703854357855053, + "grad_norm": 39.49394163397008, + "learning_rate": 8.979041284930877e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.328125, + "logps/chosen": -1361.0, + "logps/rejected": -851.0, + "loss": 0.4051, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.25390625, + "rewards/margins": 7.1953125, + "rewards/rejected": -4.94140625, + "step": 1497 + }, + { + "epoch": 0.29723696611935113, + "grad_norm": 47.046518301865405, + "learning_rate": 8.977063680071935e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.83984375, + 
"logps/chosen": -991.0, + "logps/rejected": -599.5, + "loss": 0.4673, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5048828125, + "rewards/margins": 6.5078125, + "rewards/rejected": -4.9921875, + "step": 1498 + }, + { + "epoch": 0.2974353886601518, + "grad_norm": 30.596197656211853, + "learning_rate": 8.975084407327311e-07, + "logits/chosen": 3.78515625, + "logits/rejected": 3.86328125, + "logps/chosen": -899.0, + "logps/rejected": -1118.0, + "loss": 0.4172, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.8310546875, + "rewards/margins": 6.7890625, + "rewards/rejected": -4.955078125, + "step": 1499 + }, + { + "epoch": 0.29763381120095245, + "grad_norm": 31.22891288657906, + "learning_rate": 8.973103467646425e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 3.98046875, + "logps/chosen": -873.0, + "logps/rejected": -563.0, + "loss": 0.4047, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.080078125, + "rewards/margins": 5.4765625, + "rewards/rejected": -3.40625, + "step": 1500 + }, + { + "epoch": 0.29783223374175305, + "grad_norm": 42.2040331499187, + "learning_rate": 8.971120861979499e-07, + "logits/chosen": 3.60546875, + "logits/rejected": 3.5859375, + "logps/chosen": -919.0, + "logps/rejected": -637.0, + "loss": 0.4355, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.5703125, + "rewards/margins": 6.171875, + "rewards/rejected": -4.6015625, + "step": 1501 + }, + { + "epoch": 0.2980306562825537, + "grad_norm": 30.516859852451706, + "learning_rate": 8.969136591277553e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.13671875, + "logps/chosen": -895.0, + "logps/rejected": -628.5, + "loss": 0.4151, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.50390625, + "rewards/margins": 6.140625, + "rewards/rejected": -4.6328125, + "step": 1502 + }, + { + "epoch": 0.2982290788233543, + "grad_norm": 33.70997270633689, + "learning_rate": 8.967150656492408e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 4.078125, + 
"logps/chosen": -939.0, + "logps/rejected": -982.5, + "loss": 0.3663, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.0234375, + "rewards/margins": 8.6875, + "rewards/rejected": -6.65625, + "step": 1503 + }, + { + "epoch": 0.29842750136415497, + "grad_norm": 38.84228862332853, + "learning_rate": 8.965163058576683e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.3828125, + "logps/chosen": -935.0, + "logps/rejected": -738.0, + "loss": 0.4787, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.921875, + "rewards/margins": 5.88671875, + "rewards/rejected": -3.974609375, + "step": 1504 + }, + { + "epoch": 0.2986259239049556, + "grad_norm": 33.6390095543396, + "learning_rate": 8.963173798483789e-07, + "logits/chosen": 4.09375, + "logits/rejected": 3.80078125, + "logps/chosen": -501.0, + "logps/rejected": -494.0, + "loss": 0.6102, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.703125, + "rewards/margins": 3.865234375, + "rewards/rejected": -3.162109375, + "step": 1505 + }, + { + "epoch": 0.29882434644575623, + "grad_norm": 46.59636803027213, + "learning_rate": 8.961182877167943e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.14453125, + "logps/chosen": -1205.5, + "logps/rejected": -759.0, + "loss": 0.4648, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.705078125, + "rewards/margins": 6.5546875, + "rewards/rejected": -3.83984375, + "step": 1506 + }, + { + "epoch": 0.2990227689865569, + "grad_norm": 37.67768347313928, + "learning_rate": 8.95919029558415e-07, + "logits/chosen": 3.796875, + "logits/rejected": 4.12890625, + "logps/chosen": -864.0, + "logps/rejected": -659.5, + "loss": 0.433, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.12353515625, + "rewards/margins": 6.4765625, + "rewards/rejected": -5.3515625, + "step": 1507 + }, + { + "epoch": 0.2992211915273575, + "grad_norm": 38.61039354876279, + "learning_rate": 8.957196054688221e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.9375, + "logps/chosen": -921.0, + 
"logps/rejected": -1693.0, + "loss": 0.4289, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.712890625, + "rewards/margins": 8.2109375, + "rewards/rejected": -6.50390625, + "step": 1508 + }, + { + "epoch": 0.29941961406815815, + "grad_norm": 34.66902250373254, + "learning_rate": 8.955200155436756e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.3984375, + "logps/chosen": -1072.0, + "logps/rejected": -1285.0, + "loss": 0.3748, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.89990234375, + "rewards/margins": 8.2421875, + "rewards/rejected": -6.3359375, + "step": 1509 + }, + { + "epoch": 0.2996180366089588, + "grad_norm": 39.74244101360558, + "learning_rate": 8.953202598787152e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 3.96484375, + "logps/chosen": -1074.0, + "logps/rejected": -694.0, + "loss": 0.5698, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.2451171875, + "rewards/margins": 4.9765625, + "rewards/rejected": -3.7265625, + "step": 1510 + }, + { + "epoch": 0.2998164591497594, + "grad_norm": 50.28126641215152, + "learning_rate": 8.9512033856976e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.04296875, + "logps/chosen": -1348.0, + "logps/rejected": -832.0, + "loss": 0.4222, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.705078125, + "rewards/margins": 6.15625, + "rewards/rejected": -3.453125, + "step": 1511 + }, + { + "epoch": 0.30001488169056006, + "grad_norm": 47.927890448634344, + "learning_rate": 8.949202517127091e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.0859375, + "logps/chosen": -1197.0, + "logps/rejected": -765.0, + "loss": 0.4225, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.568359375, + "rewards/margins": 5.27734375, + "rewards/rejected": -3.7109375, + "step": 1512 + }, + { + "epoch": 0.30021330423136067, + "grad_norm": 42.75116514601597, + "learning_rate": 8.9471999940354e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.30859375, + "logps/chosen": -1063.0, + 
"logps/rejected": -952.0, + "loss": 0.5462, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.192626953125, + "rewards/margins": 6.234375, + "rewards/rejected": -5.04296875, + "step": 1513 + }, + { + "epoch": 0.3004117267721613, + "grad_norm": 42.427997350190225, + "learning_rate": 8.945195817383109e-07, + "logits/chosen": 4.140625, + "logits/rejected": 3.98828125, + "logps/chosen": -1033.0, + "logps/rejected": -801.0, + "loss": 0.4209, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.853515625, + "rewards/margins": 6.078125, + "rewards/rejected": -4.234375, + "step": 1514 + }, + { + "epoch": 0.3006101493129619, + "grad_norm": 27.50145990842592, + "learning_rate": 8.943189988131581e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.140625, + "logps/chosen": -1191.5, + "logps/rejected": -971.0, + "loss": 0.4092, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.21875, + "rewards/margins": 7.9921875, + "rewards/rejected": -5.765625, + "step": 1515 + }, + { + "epoch": 0.3008085718537626, + "grad_norm": 36.03455149093631, + "learning_rate": 8.941182507242977e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.01171875, + "logps/chosen": -1198.0, + "logps/rejected": -732.0, + "loss": 0.4844, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.255859375, + "rewards/margins": 7.0390625, + "rewards/rejected": -4.77734375, + "step": 1516 + }, + { + "epoch": 0.30100699439456324, + "grad_norm": 35.809984403584195, + "learning_rate": 8.939173375680251e-07, + "logits/chosen": 3.5859375, + "logits/rejected": 3.4765625, + "logps/chosen": -708.5, + "logps/rejected": -652.5, + "loss": 0.4477, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.529296875, + "rewards/margins": 5.3515625, + "rewards/rejected": -3.8125, + "step": 1517 + }, + { + "epoch": 0.30120541693536385, + "grad_norm": 41.516036377523605, + "learning_rate": 8.93716259440715e-07, + "logits/chosen": 3.57421875, + "logits/rejected": 3.484375, + "logps/chosen": -978.0, + 
"logps/rejected": -612.0, + "loss": 0.4034, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.4599609375, + "rewards/margins": 5.71875, + "rewards/rejected": -4.251953125, + "step": 1518 + }, + { + "epoch": 0.3014038394761645, + "grad_norm": 34.07740713032206, + "learning_rate": 8.935150164388205e-07, + "logits/chosen": 4.578125, + "logits/rejected": 4.3828125, + "logps/chosen": -1176.5, + "logps/rejected": -701.0, + "loss": 0.4096, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.29296875, + "rewards/margins": 7.12109375, + "rewards/rejected": -4.818359375, + "step": 1519 + }, + { + "epoch": 0.3016022620169651, + "grad_norm": 37.916863496773914, + "learning_rate": 8.933136086588748e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.19140625, + "logps/chosen": -861.0, + "logps/rejected": -664.5, + "loss": 0.5265, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6005859375, + "rewards/margins": 5.59765625, + "rewards/rejected": -3.99609375, + "step": 1520 + }, + { + "epoch": 0.30180068455776576, + "grad_norm": 42.92492343444945, + "learning_rate": 8.931120361974893e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 3.9765625, + "logps/chosen": -1267.0, + "logps/rejected": -810.0, + "loss": 0.4267, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.34375, + "rewards/margins": 6.0546875, + "rewards/rejected": -3.71484375, + "step": 1521 + }, + { + "epoch": 0.3019991070985664, + "grad_norm": 60.81378350733936, + "learning_rate": 8.929102991513549e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.9765625, + "logps/chosen": -1292.0, + "logps/rejected": -1274.0, + "loss": 0.535, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.795166015625, + "rewards/margins": 6.9375, + "rewards/rejected": -5.13671875, + "step": 1522 + }, + { + "epoch": 0.302197529639367, + "grad_norm": 33.950653038587355, + "learning_rate": 8.927083976172411e-07, + "logits/chosen": 3.828125, + "logits/rejected": 3.64453125, + "logps/chosen": -862.5, + 
"logps/rejected": -636.0, + "loss": 0.3825, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.041015625, + "rewards/margins": 6.63671875, + "rewards/rejected": -4.603515625, + "step": 1523 + }, + { + "epoch": 0.3023959521801677, + "grad_norm": 33.158193279323775, + "learning_rate": 8.925063316919965e-07, + "logits/chosen": 4.5234375, + "logits/rejected": 4.53125, + "logps/chosen": -1142.0, + "logps/rejected": -949.0, + "loss": 0.2982, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.984375, + "rewards/margins": 16.546875, + "rewards/rejected": -14.59375, + "step": 1524 + }, + { + "epoch": 0.3025943747209683, + "grad_norm": 31.467594448875154, + "learning_rate": 8.923041014725488e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.04296875, + "logps/chosen": -1206.0, + "logps/rejected": -721.0, + "loss": 0.3633, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.30078125, + "rewards/margins": 6.7734375, + "rewards/rejected": -4.4765625, + "step": 1525 + }, + { + "epoch": 0.30279279726176894, + "grad_norm": 39.36652332639045, + "learning_rate": 8.921017070559039e-07, + "logits/chosen": 3.54296875, + "logits/rejected": 3.59765625, + "logps/chosen": -1073.0, + "logps/rejected": -809.0, + "loss": 0.4161, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.119140625, + "rewards/margins": 6.875, + "rewards/rejected": -4.7578125, + "step": 1526 + }, + { + "epoch": 0.30299121980256954, + "grad_norm": 33.99468729096775, + "learning_rate": 8.918991485391471e-07, + "logits/chosen": 4.109375, + "logits/rejected": 3.953125, + "logps/chosen": -1069.0, + "logps/rejected": -555.0, + "loss": 0.333, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.63671875, + "rewards/margins": 6.5234375, + "rewards/rejected": -3.8828125, + "step": 1527 + }, + { + "epoch": 0.3031896423433702, + "grad_norm": 36.390750178143584, + "learning_rate": 8.91696426019442e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.74609375, + "logps/chosen": -874.0, + "logps/rejected": 
-470.5, + "loss": 0.3177, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.322265625, + "rewards/margins": 6.609375, + "rewards/rejected": -4.3046875, + "step": 1528 + }, + { + "epoch": 0.30338806488417086, + "grad_norm": 30.663441566821586, + "learning_rate": 8.914935395940309e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 3.99609375, + "logps/chosen": -1240.0, + "logps/rejected": -699.0, + "loss": 0.4222, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.310546875, + "rewards/margins": 5.90625, + "rewards/rejected": -3.60546875, + "step": 1529 + }, + { + "epoch": 0.30358648742497146, + "grad_norm": 46.129011556155895, + "learning_rate": 8.91290489360235e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 3.82421875, + "logps/chosen": -1134.5, + "logps/rejected": -728.0, + "loss": 0.6154, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.947265625, + "rewards/margins": 4.490234375, + "rewards/rejected": -3.54296875, + "step": 1530 + }, + { + "epoch": 0.3037849099657721, + "grad_norm": 36.760016023819404, + "learning_rate": 8.910872754154538e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.21484375, + "logps/chosen": -869.5, + "logps/rejected": -701.0, + "loss": 0.4999, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.849609375, + "rewards/margins": 5.9296875, + "rewards/rejected": -4.07421875, + "step": 1531 + }, + { + "epoch": 0.3039833325065727, + "grad_norm": 34.4246574641755, + "learning_rate": 8.908838978571653e-07, + "logits/chosen": 4.4453125, + "logits/rejected": 4.3984375, + "logps/chosen": -707.0, + "logps/rejected": -717.0, + "loss": 0.4416, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.573486328125, + "rewards/margins": 6.46875, + "rewards/rejected": -4.8828125, + "step": 1532 + }, + { + "epoch": 0.3041817550473734, + "grad_norm": 37.71245030844833, + "learning_rate": 8.906803567829263e-07, + "logits/chosen": 3.77734375, + "logits/rejected": 3.94140625, + "logps/chosen": -1262.0, + 
"logps/rejected": -895.0, + "loss": 0.4055, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.046875, + "rewards/margins": 7.34375, + "rewards/rejected": -5.296875, + "step": 1533 + }, + { + "epoch": 0.30438017758817404, + "grad_norm": 26.000798398160658, + "learning_rate": 8.904766522903718e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.40234375, + "logps/chosen": -997.0, + "logps/rejected": -782.0, + "loss": 0.4001, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.43359375, + "rewards/margins": 6.6171875, + "rewards/rejected": -4.17578125, + "step": 1534 + }, + { + "epoch": 0.30457860012897464, + "grad_norm": 39.73910770013902, + "learning_rate": 8.902727844772151e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.2421875, + "logps/chosen": -1181.0, + "logps/rejected": -808.0, + "loss": 0.4423, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.569580078125, + "rewards/margins": 7.140625, + "rewards/rejected": -5.56640625, + "step": 1535 + }, + { + "epoch": 0.3047770226697753, + "grad_norm": 33.65744447444155, + "learning_rate": 8.90068753441248e-07, + "logits/chosen": 4.39453125, + "logits/rejected": 4.6875, + "logps/chosen": -931.0, + "logps/rejected": -761.0, + "loss": 0.4007, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.7734375, + "rewards/margins": 7.0234375, + "rewards/rejected": -5.25, + "step": 1536 + }, + { + "epoch": 0.3049754452105759, + "grad_norm": 32.75031895717201, + "learning_rate": 8.898645592803406e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.11328125, + "logps/chosen": -1382.0, + "logps/rejected": -848.0, + "loss": 0.421, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.263671875, + "rewards/margins": 7.828125, + "rewards/rejected": -5.5625, + "step": 1537 + }, + { + "epoch": 0.30517386775137656, + "grad_norm": 42.13778436386167, + "learning_rate": 8.896602020924413e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 4.05859375, + "logps/chosen": -1086.0, + "logps/rejected": -841.0, + 
"loss": 0.4531, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.095703125, + "rewards/margins": 7.2265625, + "rewards/rejected": -5.12890625, + "step": 1538 + }, + { + "epoch": 0.3053722902921772, + "grad_norm": 30.09115010974661, + "learning_rate": 8.894556819755765e-07, + "logits/chosen": 4.078125, + "logits/rejected": 4.23828125, + "logps/chosen": -978.0, + "logps/rejected": -852.0, + "loss": 0.393, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.017578125, + "rewards/margins": 7.4375, + "rewards/rejected": -5.41015625, + "step": 1539 + }, + { + "epoch": 0.3055707128329778, + "grad_norm": 36.42339475126366, + "learning_rate": 8.892509990278508e-07, + "logits/chosen": 4.078125, + "logits/rejected": 4.18359375, + "logps/chosen": -844.0, + "logps/rejected": -2093.0, + "loss": 0.4912, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.6767578125, + "rewards/margins": 9.32421875, + "rewards/rejected": -7.662109375, + "step": 1540 + }, + { + "epoch": 0.3057691353737785, + "grad_norm": 34.9125719733291, + "learning_rate": 8.890461533474473e-07, + "logits/chosen": 4.48046875, + "logits/rejected": 4.5625, + "logps/chosen": -860.0, + "logps/rejected": -927.0, + "loss": 0.4749, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.953125, + "rewards/margins": 5.68359375, + "rewards/rejected": -3.716796875, + "step": 1541 + }, + { + "epoch": 0.3059675579145791, + "grad_norm": 38.42023945072649, + "learning_rate": 8.888411450326264e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 4.16796875, + "logps/chosen": -787.0, + "logps/rejected": -585.5, + "loss": 0.5559, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.68359375, + "rewards/margins": 5.171875, + "rewards/rejected": -3.484375, + "step": 1542 + }, + { + "epoch": 0.30616598045537974, + "grad_norm": 30.019885255510893, + "learning_rate": 8.886359741817273e-07, + "logits/chosen": 4.53125, + "logits/rejected": 4.1171875, + "logps/chosen": -1183.0, + "logps/rejected": -766.0, + "loss": 0.4957, + 
"rewards/accuracies": 0.8125, + "rewards/chosen": 1.99609375, + "rewards/margins": 5.703125, + "rewards/rejected": -3.7109375, + "step": 1543 + }, + { + "epoch": 0.30636440299618034, + "grad_norm": 38.99406795192995, + "learning_rate": 8.884306408931665e-07, + "logits/chosen": 3.96875, + "logits/rejected": 3.96484375, + "logps/chosen": -913.0, + "logps/rejected": -621.0, + "loss": 0.4498, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.6787109375, + "rewards/margins": 4.86328125, + "rewards/rejected": -3.181640625, + "step": 1544 + }, + { + "epoch": 0.306562825536981, + "grad_norm": 36.913793674731856, + "learning_rate": 8.882251452654392e-07, + "logits/chosen": 3.4765625, + "logits/rejected": 3.91015625, + "logps/chosen": -827.0, + "logps/rejected": -575.0, + "loss": 0.4763, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.44921875, + "rewards/margins": 4.3515625, + "rewards/rejected": -2.91015625, + "step": 1545 + }, + { + "epoch": 0.30676124807778166, + "grad_norm": 37.13220141811173, + "learning_rate": 8.880194873971174e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.921875, + "logps/chosen": -984.0, + "logps/rejected": -908.0, + "loss": 0.621, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.49462890625, + "rewards/margins": 5.36328125, + "rewards/rejected": -3.86328125, + "step": 1546 + }, + { + "epoch": 0.30695967061858226, + "grad_norm": 34.26986371668957, + "learning_rate": 8.87813667386852e-07, + "logits/chosen": 3.8359375, + "logits/rejected": 3.7109375, + "logps/chosen": -1065.0, + "logps/rejected": -665.0, + "loss": 0.4399, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.47021484375, + "rewards/margins": 6.5546875, + "rewards/rejected": -4.08203125, + "step": 1547 + }, + { + "epoch": 0.3071580931593829, + "grad_norm": 28.477937210110817, + "learning_rate": 8.87607685333371e-07, + "logits/chosen": 3.96875, + "logits/rejected": 4.00390625, + "logps/chosen": -986.0, + "logps/rejected": -1362.5, + "loss": 0.4823, + 
"rewards/accuracies": 0.8125, + "rewards/chosen": 1.8125, + "rewards/margins": 7.53125, + "rewards/rejected": -5.7109375, + "step": 1548 + }, + { + "epoch": 0.3073565157001835, + "grad_norm": 38.73788578816611, + "learning_rate": 8.874015413354804e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.1875, + "logps/chosen": -1220.0, + "logps/rejected": -686.5, + "loss": 0.5017, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.81640625, + "rewards/margins": 6.2265625, + "rewards/rejected": -3.40234375, + "step": 1549 + }, + { + "epoch": 0.3075549382409842, + "grad_norm": 33.72122939576333, + "learning_rate": 8.871952354920637e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.79296875, + "logps/chosen": -750.5, + "logps/rejected": -553.0, + "loss": 0.4452, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2353515625, + "rewards/margins": 5.60546875, + "rewards/rejected": -4.37109375, + "step": 1550 + }, + { + "epoch": 0.30775336078178483, + "grad_norm": 31.219936409834883, + "learning_rate": 8.869887679020821e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.25390625, + "logps/chosen": -1030.0, + "logps/rejected": -784.0, + "loss": 0.4387, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.19921875, + "rewards/margins": 6.8515625, + "rewards/rejected": -4.64453125, + "step": 1551 + }, + { + "epoch": 0.30795178332258544, + "grad_norm": 38.160241942718585, + "learning_rate": 8.867821386645749e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 4.1171875, + "logps/chosen": -614.75, + "logps/rejected": -711.0, + "loss": 0.4669, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.693359375, + "rewards/margins": 6.01953125, + "rewards/rejected": -4.318359375, + "step": 1552 + }, + { + "epoch": 0.3081502058633861, + "grad_norm": 37.722924924957034, + "learning_rate": 8.865753478786578e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.03125, + "logps/chosen": -1384.0, + "logps/rejected": -1230.0, + "loss": 0.4158, + 
"rewards/accuracies": 0.84375, + "rewards/chosen": 2.57421875, + "rewards/margins": 7.6328125, + "rewards/rejected": -5.0546875, + "step": 1553 + }, + { + "epoch": 0.3083486284041867, + "grad_norm": 47.685666345973395, + "learning_rate": 8.863683956435251e-07, + "logits/chosen": 3.63671875, + "logits/rejected": 3.921875, + "logps/chosen": -1007.0, + "logps/rejected": -1009.0, + "loss": 0.4566, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.2607421875, + "rewards/margins": 6.765625, + "rewards/rejected": -5.5, + "step": 1554 + }, + { + "epoch": 0.30854705094498736, + "grad_norm": 38.18939562287165, + "learning_rate": 8.861612820584482e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.17578125, + "logps/chosen": -931.0, + "logps/rejected": -1128.0, + "loss": 0.5476, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.76611328125, + "rewards/margins": 5.8046875, + "rewards/rejected": -5.0390625, + "step": 1555 + }, + { + "epoch": 0.30874547348578796, + "grad_norm": 32.17396238762325, + "learning_rate": 8.859540072227754e-07, + "logits/chosen": 3.77734375, + "logits/rejected": 3.6015625, + "logps/chosen": -994.0, + "logps/rejected": -581.5, + "loss": 0.4484, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.123046875, + "rewards/margins": 6.1640625, + "rewards/rejected": -4.03515625, + "step": 1556 + }, + { + "epoch": 0.3089438960265886, + "grad_norm": 37.004609370994515, + "learning_rate": 8.857465712359331e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.609375, + "logps/chosen": -819.0, + "logps/rejected": -821.0, + "loss": 0.4138, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.939453125, + "rewards/margins": 6.96875, + "rewards/rejected": -5.01953125, + "step": 1557 + }, + { + "epoch": 0.3091423185673893, + "grad_norm": 36.263357059787005, + "learning_rate": 8.855389741974244e-07, + "logits/chosen": 4.78125, + "logits/rejected": 4.56640625, + "logps/chosen": -956.0, + "logps/rejected": -874.0, + "loss": 0.4972, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 1.4248046875, + "rewards/margins": 5.8828125, + "rewards/rejected": -4.46875, + "step": 1558 + }, + { + "epoch": 0.3093407411081899, + "grad_norm": 43.737306268815004, + "learning_rate": 8.853312162068302e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.08203125, + "logps/chosen": -1031.0, + "logps/rejected": -805.0, + "loss": 0.4194, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.98828125, + "rewards/margins": 6.3828125, + "rewards/rejected": -4.390625, + "step": 1559 + }, + { + "epoch": 0.30953916364899053, + "grad_norm": 34.99727393796585, + "learning_rate": 8.851232973638082e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.36328125, + "logps/chosen": -1273.0, + "logps/rejected": -892.5, + "loss": 0.4003, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.400390625, + "rewards/margins": 7.65625, + "rewards/rejected": -5.23828125, + "step": 1560 + }, + { + "epoch": 0.30973758618979114, + "grad_norm": 32.428227715630484, + "learning_rate": 8.849152177680932e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.203125, + "logps/chosen": -970.0, + "logps/rejected": -656.0, + "loss": 0.3615, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.86572265625, + "rewards/margins": 6.0703125, + "rewards/rejected": -4.21875, + "step": 1561 + }, + { + "epoch": 0.3099360087305918, + "grad_norm": 33.31011889803692, + "learning_rate": 8.847069775194976e-07, + "logits/chosen": 4.35546875, + "logits/rejected": 4.28515625, + "logps/chosen": -1082.0, + "logps/rejected": -1020.0, + "loss": 0.4744, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.134765625, + "rewards/margins": 5.09375, + "rewards/rejected": -2.958984375, + "step": 1562 + }, + { + "epoch": 0.31013443127139245, + "grad_norm": 31.485258883783363, + "learning_rate": 8.844985767179101e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.17578125, + "logps/chosen": -1086.0, + "logps/rejected": -673.5, + "loss": 0.4427, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 2.2890625, + "rewards/margins": 6.1171875, + "rewards/rejected": -3.82421875, + "step": 1563 + }, + { + "epoch": 0.31033285381219305, + "grad_norm": 35.8122843483905, + "learning_rate": 8.842900154632974e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.1015625, + "logps/chosen": -1002.0, + "logps/rejected": -763.0, + "loss": 0.3726, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.169921875, + "rewards/margins": 7.0859375, + "rewards/rejected": -4.92578125, + "step": 1564 + }, + { + "epoch": 0.3105312763529937, + "grad_norm": 38.78289704529296, + "learning_rate": 8.840812938557022e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.50390625, + "logps/chosen": -1232.0, + "logps/rejected": -885.0, + "loss": 0.3708, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.189453125, + "rewards/margins": 8.578125, + "rewards/rejected": -5.37890625, + "step": 1565 + }, + { + "epoch": 0.3107296988937943, + "grad_norm": 30.42947957597647, + "learning_rate": 8.838724119952448e-07, + "logits/chosen": 4.29296875, + "logits/rejected": 3.9609375, + "logps/chosen": -1166.0, + "logps/rejected": -694.5, + "loss": 0.3922, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.45703125, + "rewards/margins": 6.86328125, + "rewards/rejected": -4.41015625, + "step": 1566 + }, + { + "epoch": 0.310928121434595, + "grad_norm": 33.2184146381582, + "learning_rate": 8.83663369982122e-07, + "logits/chosen": 3.984375, + "logits/rejected": 3.9453125, + "logps/chosen": -973.0, + "logps/rejected": -643.5, + "loss": 0.3186, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.244140625, + "rewards/margins": 7.125, + "rewards/rejected": -4.87109375, + "step": 1567 + }, + { + "epoch": 0.31112654397539563, + "grad_norm": 37.56872026527501, + "learning_rate": 8.834541679166076e-07, + "logits/chosen": 4.34765625, + "logits/rejected": 4.1875, + "logps/chosen": -768.5, + "logps/rejected": -582.0, + "loss": 0.5968, + "rewards/accuracies": 
0.84375, + "rewards/chosen": 1.43359375, + "rewards/margins": 4.8828125, + "rewards/rejected": -3.4521484375, + "step": 1568 + }, + { + "epoch": 0.31132496651619623, + "grad_norm": 30.73049130783412, + "learning_rate": 8.832448058990521e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.0390625, + "logps/chosen": -1147.0, + "logps/rejected": -706.0, + "loss": 0.429, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.39453125, + "rewards/margins": 6.375, + "rewards/rejected": -4.9765625, + "step": 1569 + }, + { + "epoch": 0.3115233890569969, + "grad_norm": 33.13408262204491, + "learning_rate": 8.830352840298826e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.2578125, + "logps/chosen": -1309.0, + "logps/rejected": -956.0, + "loss": 0.5009, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.234375, + "rewards/margins": 6.0859375, + "rewards/rejected": -3.84375, + "step": 1570 + }, + { + "epoch": 0.3117218115977975, + "grad_norm": 34.51851669226963, + "learning_rate": 8.828256024096034e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.0546875, + "logps/chosen": -942.0, + "logps/rejected": -648.5, + "loss": 0.5766, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.478515625, + "rewards/margins": 4.5703125, + "rewards/rejected": -3.09375, + "step": 1571 + }, + { + "epoch": 0.31192023413859815, + "grad_norm": 39.06488869797547, + "learning_rate": 8.826157611387945e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.203125, + "logps/chosen": -971.5, + "logps/rejected": -606.0, + "loss": 0.433, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.65234375, + "rewards/margins": 5.796875, + "rewards/rejected": -3.1484375, + "step": 1572 + }, + { + "epoch": 0.31211865667939875, + "grad_norm": 35.16759842048071, + "learning_rate": 8.824057603181135e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.625, + "logps/chosen": -1156.0, + "logps/rejected": -1430.0, + "loss": 0.3871, + "rewards/accuracies": 0.875, + "rewards/chosen": 
1.384765625, + "rewards/margins": 7.8984375, + "rewards/rejected": -6.5078125, + "step": 1573 + }, + { + "epoch": 0.3123170792201994, + "grad_norm": 34.946554920662784, + "learning_rate": 8.821956000482938e-07, + "logits/chosen": 4.49609375, + "logits/rejected": 4.51953125, + "logps/chosen": -1410.0, + "logps/rejected": -1030.0, + "loss": 0.4878, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.080078125, + "rewards/margins": 5.28125, + "rewards/rejected": -4.21484375, + "step": 1574 + }, + { + "epoch": 0.31251550176100007, + "grad_norm": 29.15307825007122, + "learning_rate": 8.819852804301457e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.19140625, + "logps/chosen": -1106.0, + "logps/rejected": -660.5, + "loss": 0.4462, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.841796875, + "rewards/margins": 6.7890625, + "rewards/rejected": -3.94140625, + "step": 1575 + }, + { + "epoch": 0.31271392430180067, + "grad_norm": 34.135241392152274, + "learning_rate": 8.817748015645558e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.12890625, + "logps/chosen": -961.0, + "logps/rejected": -1012.5, + "loss": 0.5597, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.7734375, + "rewards/margins": 6.625, + "rewards/rejected": -4.8515625, + "step": 1576 + }, + { + "epoch": 0.31291234684260133, + "grad_norm": 41.57274986097788, + "learning_rate": 8.815641635524868e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.71484375, + "logps/chosen": -1088.0, + "logps/rejected": -678.0, + "loss": 0.389, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.185546875, + "rewards/margins": 7.0703125, + "rewards/rejected": -4.875, + "step": 1577 + }, + { + "epoch": 0.31311076938340193, + "grad_norm": 37.52025533038154, + "learning_rate": 8.813533664949784e-07, + "logits/chosen": 4.3984375, + "logits/rejected": 4.6015625, + "logps/chosen": -806.0, + "logps/rejected": -541.5, + "loss": 0.5299, + "rewards/accuracies": 0.84375, + "rewards/chosen": 
1.0673828125, + "rewards/margins": 5.2421875, + "rewards/rejected": -4.17578125, + "step": 1578 + }, + { + "epoch": 0.3133091919242026, + "grad_norm": 44.19452941187138, + "learning_rate": 8.81142410493146e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.2890625, + "logps/chosen": -1174.0, + "logps/rejected": -587.0, + "loss": 0.4968, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.2578125, + "rewards/margins": 7.640625, + "rewards/rejected": -5.3828125, + "step": 1579 + }, + { + "epoch": 0.31350761446500325, + "grad_norm": 42.91763385577145, + "learning_rate": 8.809312956481815e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 3.8984375, + "logps/chosen": -1230.0, + "logps/rejected": -681.5, + "loss": 0.4338, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.36962890625, + "rewards/margins": 6.9765625, + "rewards/rejected": -5.609375, + "step": 1580 + }, + { + "epoch": 0.31370603700580385, + "grad_norm": 33.114797268213295, + "learning_rate": 8.807200220613527e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.2890625, + "logps/chosen": -1468.0, + "logps/rejected": -835.0, + "loss": 0.3313, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.39404296875, + "rewards/margins": 8.109375, + "rewards/rejected": -6.7109375, + "step": 1581 + }, + { + "epoch": 0.3139044595466045, + "grad_norm": 35.63860188774205, + "learning_rate": 8.80508589834004e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.984375, + "logps/chosen": -983.0, + "logps/rejected": -1427.0, + "loss": 0.3637, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.26171875, + "rewards/margins": 8.53125, + "rewards/rejected": -6.265625, + "step": 1582 + }, + { + "epoch": 0.3141028820874051, + "grad_norm": 38.498112754549446, + "learning_rate": 8.802969990675558e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 3.6328125, + "logps/chosen": -714.5, + "logps/rejected": -680.5, + "loss": 0.4797, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.419921875, 
+ "rewards/margins": 4.921875, + "rewards/rejected": -3.515625, + "step": 1583 + }, + { + "epoch": 0.31430130462820577, + "grad_norm": 36.570589099484096, + "learning_rate": 8.800852498635042e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.87890625, + "logps/chosen": -643.5, + "logps/rejected": -550.5, + "loss": 0.4194, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.849609375, + "rewards/margins": 6.6015625, + "rewards/rejected": -4.75390625, + "step": 1584 + }, + { + "epoch": 0.3144997271690064, + "grad_norm": 33.045560482976164, + "learning_rate": 8.798733423234219e-07, + "logits/chosen": 4.23046875, + "logits/rejected": 4.1953125, + "logps/chosen": -1102.0, + "logps/rejected": -679.5, + "loss": 0.4158, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.583984375, + "rewards/margins": 6.7578125, + "rewards/rejected": -4.1640625, + "step": 1585 + }, + { + "epoch": 0.31469814970980703, + "grad_norm": 33.81063775186394, + "learning_rate": 8.796612765489567e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.29296875, + "logps/chosen": -1071.0, + "logps/rejected": -706.0, + "loss": 0.4386, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3603515625, + "rewards/margins": 6.6484375, + "rewards/rejected": -4.29296875, + "step": 1586 + }, + { + "epoch": 0.3148965722506077, + "grad_norm": 32.75913832990661, + "learning_rate": 8.79449052641833e-07, + "logits/chosen": 4.23046875, + "logits/rejected": 4.5234375, + "logps/chosen": -933.0, + "logps/rejected": -1362.0, + "loss": 0.4666, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.083984375, + "rewards/margins": 7.87109375, + "rewards/rejected": -5.794921875, + "step": 1587 + }, + { + "epoch": 0.3150949947914083, + "grad_norm": 33.05903601407344, + "learning_rate": 8.792366707038508e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.125, + "logps/chosen": -965.0, + "logps/rejected": -786.0, + "loss": 0.4212, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3203125, + 
"rewards/margins": 7.765625, + "rewards/rejected": -5.44921875, + "step": 1588 + }, + { + "epoch": 0.31529341733220895, + "grad_norm": 28.926229932718527, + "learning_rate": 8.790241308368861e-07, + "logits/chosen": 4.55078125, + "logits/rejected": 4.375, + "logps/chosen": -696.5, + "logps/rejected": -621.0, + "loss": 0.3411, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5625, + "rewards/margins": 5.9921875, + "rewards/rejected": -3.4375, + "step": 1589 + }, + { + "epoch": 0.31549183987300955, + "grad_norm": 36.90091830823745, + "learning_rate": 8.788114331428903e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 4.1796875, + "logps/chosen": -1216.0, + "logps/rejected": -744.5, + "loss": 0.4386, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.162109375, + "rewards/margins": 6.80078125, + "rewards/rejected": -4.630859375, + "step": 1590 + }, + { + "epoch": 0.3156902624138102, + "grad_norm": 31.622050946253697, + "learning_rate": 8.785985777238908e-07, + "logits/chosen": 4.59375, + "logits/rejected": 4.23828125, + "logps/chosen": -1280.0, + "logps/rejected": -659.5, + "loss": 0.2494, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.96875, + "rewards/margins": 7.171875, + "rewards/rejected": -4.193359375, + "step": 1591 + }, + { + "epoch": 0.31588868495461087, + "grad_norm": 36.409155520106005, + "learning_rate": 8.783855646819906e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 4.10546875, + "logps/chosen": -847.5, + "logps/rejected": -775.5, + "loss": 0.4902, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.5048828125, + "rewards/margins": 5.109375, + "rewards/rejected": -3.607421875, + "step": 1592 + }, + { + "epoch": 0.31608710749541147, + "grad_norm": 35.65838898759721, + "learning_rate": 8.781723941193683e-07, + "logits/chosen": 4.609375, + "logits/rejected": 4.765625, + "logps/chosen": -809.0, + "logps/rejected": -1033.0, + "loss": 0.4636, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.982421875, + "rewards/margins": 
7.2421875, + "rewards/rejected": -5.26953125, + "step": 1593 + }, + { + "epoch": 0.3162855300362121, + "grad_norm": 28.62817958986653, + "learning_rate": 8.779590661382778e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.25, + "logps/chosen": -1244.0, + "logps/rejected": -880.0, + "loss": 0.4498, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.521484375, + "rewards/margins": 6.8984375, + "rewards/rejected": -4.3828125, + "step": 1594 + }, + { + "epoch": 0.31648395257701273, + "grad_norm": 33.36040845246024, + "learning_rate": 8.777455808410489e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 4.046875, + "logps/chosen": -928.0, + "logps/rejected": -1497.0, + "loss": 0.4447, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.09765625, + "rewards/margins": 7.48046875, + "rewards/rejected": -5.3671875, + "step": 1595 + }, + { + "epoch": 0.3166823751178134, + "grad_norm": 29.11930885436569, + "learning_rate": 8.775319383300869e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.4375, + "logps/chosen": -687.0, + "logps/rejected": -771.0, + "loss": 0.44, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.634765625, + "rewards/margins": 5.50390625, + "rewards/rejected": -3.87890625, + "step": 1596 + }, + { + "epoch": 0.31688079765861404, + "grad_norm": 35.69245893213206, + "learning_rate": 8.773181387078719e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 4.2890625, + "logps/chosen": -682.5, + "logps/rejected": -613.0, + "loss": 0.4608, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.80859375, + "rewards/margins": 5.6875, + "rewards/rejected": -3.87109375, + "step": 1597 + }, + { + "epoch": 0.31707922019941465, + "grad_norm": 28.876477586024745, + "learning_rate": 8.771041820769601e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.98046875, + "logps/chosen": -1131.0, + "logps/rejected": -712.0, + "loss": 0.4125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.123046875, + "rewards/margins": 5.84375, + 
"rewards/rejected": -3.73828125, + "step": 1598 + }, + { + "epoch": 0.3172776427402153, + "grad_norm": 37.07106035214563, + "learning_rate": 8.768900685399825e-07, + "logits/chosen": 4.52734375, + "logits/rejected": 4.4921875, + "logps/chosen": -879.5, + "logps/rejected": -670.0, + "loss": 0.419, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.2109375, + "rewards/margins": 6.9375, + "rewards/rejected": -4.734375, + "step": 1599 + }, + { + "epoch": 0.3174760652810159, + "grad_norm": 43.66742390904757, + "learning_rate": 8.766757981996458e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.16015625, + "logps/chosen": -850.0, + "logps/rejected": -554.5, + "loss": 0.3601, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.92578125, + "rewards/margins": 5.8828125, + "rewards/rejected": -3.9453125, + "step": 1600 + }, + { + "epoch": 0.31767448782181656, + "grad_norm": 34.202802246530965, + "learning_rate": 8.764613711587315e-07, + "logits/chosen": 4.1875, + "logits/rejected": 3.953125, + "logps/chosen": -872.0, + "logps/rejected": -576.0, + "loss": 0.5071, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.107421875, + "rewards/margins": 4.71875, + "rewards/rejected": -2.611328125, + "step": 1601 + }, + { + "epoch": 0.31787291036261717, + "grad_norm": 37.23446423416938, + "learning_rate": 8.762467875200965e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 3.96875, + "logps/chosen": -1169.0, + "logps/rejected": -734.0, + "loss": 0.4699, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1015625, + "rewards/margins": 5.625, + "rewards/rejected": -3.51953125, + "step": 1602 + }, + { + "epoch": 0.3180713329034178, + "grad_norm": 39.51516456623151, + "learning_rate": 8.760320473866727e-07, + "logits/chosen": 4.31640625, + "logits/rejected": 4.05859375, + "logps/chosen": -2111.0, + "logps/rejected": -732.5, + "loss": 0.5365, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4453125, + "rewards/margins": 3.875, + "rewards/rejected": -4.31640625, + 
"step": 1603 + }, + { + "epoch": 0.3182697554442185, + "grad_norm": 34.78184573590274, + "learning_rate": 8.758171508614671e-07, + "logits/chosen": 4.078125, + "logits/rejected": 3.8046875, + "logps/chosen": -646.5, + "logps/rejected": -512.75, + "loss": 0.4261, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.998046875, + "rewards/margins": 5.6484375, + "rewards/rejected": -3.65234375, + "step": 1604 + }, + { + "epoch": 0.3184681779850191, + "grad_norm": 34.47673489406134, + "learning_rate": 8.756020980475619e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.17578125, + "logps/chosen": -1090.5, + "logps/rejected": -785.0, + "loss": 0.3424, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.603515625, + "rewards/margins": 7.8984375, + "rewards/rejected": -5.3046875, + "step": 1605 + }, + { + "epoch": 0.31866660052581974, + "grad_norm": 116.8851027862789, + "learning_rate": 8.753868890481141e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.18359375, + "logps/chosen": -983.0, + "logps/rejected": -752.5, + "loss": 0.4016, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.609375, + "rewards/margins": 6.8515625, + "rewards/rejected": -4.23828125, + "step": 1606 + }, + { + "epoch": 0.31886502306662035, + "grad_norm": 27.93961326019036, + "learning_rate": 8.751715239663554e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.69921875, + "logps/chosen": -1083.0, + "logps/rejected": -697.0, + "loss": 0.3551, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5078125, + "rewards/margins": 6.90625, + "rewards/rejected": -4.3984375, + "step": 1607 + }, + { + "epoch": 0.319063445607421, + "grad_norm": 35.37137617844516, + "learning_rate": 8.749560029055928e-07, + "logits/chosen": 3.953125, + "logits/rejected": 4.1640625, + "logps/chosen": -659.0, + "logps/rejected": -1928.0, + "loss": 0.4005, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.138671875, + "rewards/margins": 8.15234375, + "rewards/rejected": -6.015625, + "step": 1608 + }, 
+ { + "epoch": 0.31926186814822166, + "grad_norm": 39.9560459191062, + "learning_rate": 8.747403259692078e-07, + "logits/chosen": 3.9375, + "logits/rejected": 4.08984375, + "logps/chosen": -1085.0, + "logps/rejected": -867.0, + "loss": 0.3926, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.529296875, + "rewards/margins": 7.3828125, + "rewards/rejected": -4.84375, + "step": 1609 + }, + { + "epoch": 0.31946029068902226, + "grad_norm": 35.88044468304983, + "learning_rate": 8.745244932606571e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.296875, + "logps/chosen": -950.0, + "logps/rejected": -682.0, + "loss": 0.5045, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5634765625, + "rewards/margins": 6.140625, + "rewards/rejected": -4.57421875, + "step": 1610 + }, + { + "epoch": 0.3196587132298229, + "grad_norm": 35.565242759709875, + "learning_rate": 8.743085048834715e-07, + "logits/chosen": 4.5546875, + "logits/rejected": 4.4453125, + "logps/chosen": -1143.0, + "logps/rejected": -986.0, + "loss": 0.3465, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.705078125, + "rewards/margins": 9.0859375, + "rewards/rejected": -6.390625, + "step": 1611 + }, + { + "epoch": 0.3198571357706235, + "grad_norm": 40.873273086652276, + "learning_rate": 8.740923609412569e-07, + "logits/chosen": 3.51171875, + "logits/rejected": 3.6484375, + "logps/chosen": -1178.0, + "logps/rejected": -812.0, + "loss": 0.4997, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.51416015625, + "rewards/margins": 7.015625, + "rewards/rejected": -5.5078125, + "step": 1612 + }, + { + "epoch": 0.3200555583114242, + "grad_norm": 27.086397294987794, + "learning_rate": 8.738760615376938e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.265625, + "logps/chosen": -1036.0, + "logps/rejected": -560.5, + "loss": 0.3429, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.896484375, + "rewards/margins": 7.3671875, + "rewards/rejected": -4.47265625, + "step": 1613 + }, + { + "epoch": 
0.32025398085222484, + "grad_norm": 31.7229793833966, + "learning_rate": 8.736596067765368e-07, + "logits/chosen": 4.37109375, + "logits/rejected": 4.015625, + "logps/chosen": -1251.0, + "logps/rejected": -1087.0, + "loss": 0.3579, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.43359375, + "rewards/margins": 10.0390625, + "rewards/rejected": -7.59375, + "step": 1614 + }, + { + "epoch": 0.32045240339302544, + "grad_norm": 36.61354550985506, + "learning_rate": 8.734429967616158e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.0078125, + "logps/chosen": -941.0, + "logps/rejected": -638.0, + "loss": 0.3486, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.767578125, + "rewards/margins": 7.3671875, + "rewards/rejected": -5.5859375, + "step": 1615 + }, + { + "epoch": 0.3206508259338261, + "grad_norm": 38.681075105153354, + "learning_rate": 8.732262315968348e-07, + "logits/chosen": 3.66796875, + "logits/rejected": 3.8046875, + "logps/chosen": -1276.0, + "logps/rejected": -663.5, + "loss": 0.6754, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.85546875, + "rewards/margins": 2.470703125, + "rewards/rejected": -3.3251953125, + "step": 1616 + }, + { + "epoch": 0.3208492484746267, + "grad_norm": 35.06897138381646, + "learning_rate": 8.73009311386172e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.421875, + "logps/chosen": -1086.0, + "logps/rejected": -847.0, + "loss": 0.3507, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4609375, + "rewards/margins": 7.734375, + "rewards/rejected": -5.26171875, + "step": 1617 + }, + { + "epoch": 0.32104767101542736, + "grad_norm": 32.157773707771966, + "learning_rate": 8.727922362336801e-07, + "logits/chosen": 3.73046875, + "logits/rejected": 3.7265625, + "logps/chosen": -909.0, + "logps/rejected": -684.0, + "loss": 0.512, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.205078125, + "rewards/margins": 6.171875, + "rewards/rejected": -4.95703125, + "step": 1618 + }, + { + "epoch": 
0.32124609355622796, + "grad_norm": 32.42827405292003, + "learning_rate": 8.725750062434866e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.04296875, + "logps/chosen": -1386.0, + "logps/rejected": -890.0, + "loss": 0.2929, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.828125, + "rewards/margins": 9.3203125, + "rewards/rejected": -6.4921875, + "step": 1619 + }, + { + "epoch": 0.3214445160970286, + "grad_norm": 28.609916390817805, + "learning_rate": 8.723576215197923e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 3.98046875, + "logps/chosen": -1202.5, + "logps/rejected": -682.0, + "loss": 0.3649, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.7216796875, + "rewards/margins": 6.984375, + "rewards/rejected": -5.25, + "step": 1620 + }, + { + "epoch": 0.3216429386378293, + "grad_norm": 34.19226412091139, + "learning_rate": 8.721400821668733e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.88671875, + "logps/chosen": -1029.0, + "logps/rejected": -838.0, + "loss": 0.4785, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.08740234375, + "rewards/margins": 6.921875, + "rewards/rejected": -5.84375, + "step": 1621 + }, + { + "epoch": 0.3218413611786299, + "grad_norm": 32.3650623180219, + "learning_rate": 8.719223882890791e-07, + "logits/chosen": 3.828125, + "logits/rejected": 3.73046875, + "logps/chosen": -1044.0, + "logps/rejected": -803.0, + "loss": 0.4272, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.619140625, + "rewards/margins": 7.0625, + "rewards/rejected": -4.4296875, + "step": 1622 + }, + { + "epoch": 0.32203978371943054, + "grad_norm": 39.40019947657762, + "learning_rate": 8.717045399908335e-07, + "logits/chosen": 4.078125, + "logits/rejected": 4.25390625, + "logps/chosen": -1023.5, + "logps/rejected": -750.5, + "loss": 0.5162, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.56640625, + "rewards/margins": 5.53125, + "rewards/rejected": -3.9609375, + "step": 1623 + }, + { + "epoch": 0.32223820626023114, + 
"grad_norm": 39.31600785854607, + "learning_rate": 8.714865373766348e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 3.98828125, + "logps/chosen": -950.0, + "logps/rejected": -1013.0, + "loss": 0.3628, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.8525390625, + "rewards/margins": 10.3203125, + "rewards/rejected": -8.4609375, + "step": 1624 + }, + { + "epoch": 0.3224366288010318, + "grad_norm": 46.18882875184133, + "learning_rate": 8.712683805510545e-07, + "logits/chosen": 4.4921875, + "logits/rejected": 4.53125, + "logps/chosen": -1008.0, + "logps/rejected": -720.0, + "loss": 0.5632, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.67578125, + "rewards/margins": 4.58203125, + "rewards/rejected": -2.9140625, + "step": 1625 + }, + { + "epoch": 0.32263505134183246, + "grad_norm": 34.98576995104795, + "learning_rate": 8.710500696187391e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.89453125, + "logps/chosen": -983.0, + "logps/rejected": -772.0, + "loss": 0.4543, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.095703125, + "rewards/margins": 5.76171875, + "rewards/rejected": -3.658203125, + "step": 1626 + }, + { + "epoch": 0.32283347388263306, + "grad_norm": 38.64820286843999, + "learning_rate": 8.70831604684408e-07, + "logits/chosen": 4.58203125, + "logits/rejected": 4.08203125, + "logps/chosen": -1266.0, + "logps/rejected": -704.0, + "loss": 0.3719, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.30859375, + "rewards/margins": 7.0390625, + "rewards/rejected": -4.73046875, + "step": 1627 + }, + { + "epoch": 0.3230318964234337, + "grad_norm": 30.60276446801776, + "learning_rate": 8.706129858528551e-07, + "logits/chosen": 4.625, + "logits/rejected": 4.6640625, + "logps/chosen": -800.0, + "logps/rejected": -619.0, + "loss": 0.3681, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.986572265625, + "rewards/margins": 6.34765625, + "rewards/rejected": -4.34765625, + "step": 1628 + }, + { + "epoch": 0.3232303189642343, + 
"grad_norm": 40.355199618154955, + "learning_rate": 8.703942132289484e-07, + "logits/chosen": 4.32421875, + "logits/rejected": 4.43359375, + "logps/chosen": -892.0, + "logps/rejected": -588.0, + "loss": 0.5317, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.921875, + "rewards/margins": 4.296875, + "rewards/rejected": -2.37109375, + "step": 1629 + }, + { + "epoch": 0.323428741505035, + "grad_norm": 41.886395757089076, + "learning_rate": 8.701752869176286e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 4.33203125, + "logps/chosen": -878.0, + "logps/rejected": -1337.5, + "loss": 0.5089, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.322021484375, + "rewards/margins": 6.84765625, + "rewards/rejected": -5.525390625, + "step": 1630 + }, + { + "epoch": 0.3236271640458356, + "grad_norm": 39.1236158987703, + "learning_rate": 8.699562070239109e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.1953125, + "logps/chosen": -701.0, + "logps/rejected": -466.0, + "loss": 0.4335, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.96875, + "rewards/margins": 4.9140625, + "rewards/rejected": -2.9453125, + "step": 1631 + }, + { + "epoch": 0.32382558658663624, + "grad_norm": 33.376802571856516, + "learning_rate": 8.697369736528844e-07, + "logits/chosen": 4.5078125, + "logits/rejected": 4.25390625, + "logps/chosen": -947.0, + "logps/rejected": -611.0, + "loss": 0.5038, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8330078125, + "rewards/margins": 5.89453125, + "rewards/rejected": -4.060546875, + "step": 1632 + }, + { + "epoch": 0.3240240091274369, + "grad_norm": 39.75710706521513, + "learning_rate": 8.695175869097111e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.296875, + "logps/chosen": -1260.0, + "logps/rejected": -1751.0, + "loss": 0.3045, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.84375, + "rewards/margins": 10.203125, + "rewards/rejected": -7.359375, + "step": 1633 + }, + { + "epoch": 0.3242224316682375, + "grad_norm": 
27.62990630744432, + "learning_rate": 8.69298046899627e-07, + "logits/chosen": 4.078125, + "logits/rejected": 4.2265625, + "logps/chosen": -896.0, + "logps/rejected": -641.0, + "loss": 0.3381, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.80078125, + "rewards/margins": 7.4140625, + "rewards/rejected": -5.60546875, + "step": 1634 + }, + { + "epoch": 0.32442085420903816, + "grad_norm": 35.10598751174298, + "learning_rate": 8.690783537279417e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 4.1015625, + "logps/chosen": -684.0, + "logps/rejected": -547.5, + "loss": 0.4353, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.505859375, + "rewards/margins": 6.0703125, + "rewards/rejected": -4.578125, + "step": 1635 + }, + { + "epoch": 0.32461927674983876, + "grad_norm": 39.050782538241656, + "learning_rate": 8.68858507500038e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.140625, + "logps/chosen": -1216.0, + "logps/rejected": -1565.0, + "loss": 0.4849, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5439453125, + "rewards/margins": 8.9453125, + "rewards/rejected": -6.40625, + "step": 1636 + }, + { + "epoch": 0.3248176992906394, + "grad_norm": 40.82506394620909, + "learning_rate": 8.686385083213722e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.0703125, + "logps/chosen": -1179.0, + "logps/rejected": -637.0, + "loss": 0.4381, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.484375, + "rewards/margins": 5.40625, + "rewards/rejected": -3.91796875, + "step": 1637 + }, + { + "epoch": 0.3250161218314401, + "grad_norm": 37.50918126661096, + "learning_rate": 8.68418356297474e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.91796875, + "logps/chosen": -869.0, + "logps/rejected": -865.0, + "loss": 0.5005, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.2060546875, + "rewards/margins": 5.7734375, + "rewards/rejected": -4.55859375, + "step": 1638 + }, + { + "epoch": 0.3252145443722407, + "grad_norm": 
35.159015774516824, + "learning_rate": 8.681980515339463e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.5390625, + "logps/chosen": -670.0, + "logps/rejected": -749.5, + "loss": 0.5296, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.591796875, + "rewards/margins": 10.91015625, + "rewards/rejected": -9.296875, + "step": 1639 + }, + { + "epoch": 0.32541296691304133, + "grad_norm": 37.63611162421852, + "learning_rate": 8.679775941364657e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.19921875, + "logps/chosen": -1147.0, + "logps/rejected": -906.0, + "loss": 0.4318, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.345703125, + "rewards/margins": 8.1171875, + "rewards/rejected": -5.78125, + "step": 1640 + }, + { + "epoch": 0.32561138945384194, + "grad_norm": 33.80855432145968, + "learning_rate": 8.677569842107815e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 3.94921875, + "logps/chosen": -1161.0, + "logps/rejected": -606.5, + "loss": 0.4168, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.33984375, + "rewards/margins": 6.625, + "rewards/rejected": -4.296875, + "step": 1641 + }, + { + "epoch": 0.3258098119946426, + "grad_norm": 49.35696150467958, + "learning_rate": 8.675362218627162e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.1875, + "logps/chosen": -1195.0, + "logps/rejected": -626.5, + "loss": 0.4782, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.470703125, + "rewards/margins": 5.41796875, + "rewards/rejected": -3.953125, + "step": 1642 + }, + { + "epoch": 0.32600823453544325, + "grad_norm": 35.42375978879802, + "learning_rate": 8.673153071981659e-07, + "logits/chosen": 4.42578125, + "logits/rejected": 4.34765625, + "logps/chosen": -771.5, + "logps/rejected": -557.5, + "loss": 0.5347, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.81201171875, + "rewards/margins": 5.544921875, + "rewards/rejected": -3.71875, + "step": 1643 + }, + { + "epoch": 0.32620665707624386, + "grad_norm": 
38.89824226839647, + "learning_rate": 8.670942403230994e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.234375, + "logps/chosen": -955.0, + "logps/rejected": -689.0, + "loss": 0.4766, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.251678466796875, + "rewards/margins": 5.90625, + "rewards/rejected": -4.66015625, + "step": 1644 + }, + { + "epoch": 0.3264050796170445, + "grad_norm": 35.21811229152537, + "learning_rate": 8.668730213435582e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.4140625, + "logps/chosen": -1089.0, + "logps/rejected": -876.0, + "loss": 0.3242, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.85546875, + "rewards/margins": 8.1171875, + "rewards/rejected": -5.24609375, + "step": 1645 + }, + { + "epoch": 0.3266035021578451, + "grad_norm": 36.72011192161924, + "learning_rate": 8.666516503656576e-07, + "logits/chosen": 4.25, + "logits/rejected": 3.87890625, + "logps/chosen": -1121.5, + "logps/rejected": -517.75, + "loss": 0.4947, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.595703125, + "rewards/margins": 5.0390625, + "rewards/rejected": -3.447265625, + "step": 1646 + }, + { + "epoch": 0.3268019246986458, + "grad_norm": 38.03790132731791, + "learning_rate": 8.66430127495585e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.10546875, + "logps/chosen": -1051.0, + "logps/rejected": -802.0, + "loss": 0.5128, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.61328125, + "rewards/margins": 5.671875, + "rewards/rejected": -4.0625, + "step": 1647 + }, + { + "epoch": 0.3270003472394464, + "grad_norm": 27.475229761954363, + "learning_rate": 8.662084528396011e-07, + "logits/chosen": 3.90625, + "logits/rejected": 4.078125, + "logps/chosen": -860.0, + "logps/rejected": -586.5, + "loss": 0.3389, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.30078125, + "rewards/margins": 7.3125, + "rewards/rejected": -5.01171875, + "step": 1648 + }, + { + "epoch": 0.32719876978024703, + "grad_norm": 41.42152583831772, + 
"learning_rate": 8.659866265040392e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.3046875, + "logps/chosen": -1002.0, + "logps/rejected": -807.0, + "loss": 0.4123, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.349609375, + "rewards/margins": 7.015625, + "rewards/rejected": -4.6640625, + "step": 1649 + }, + { + "epoch": 0.3273971923210477, + "grad_norm": 43.292719426686524, + "learning_rate": 8.657646485953056e-07, + "logits/chosen": 4.7109375, + "logits/rejected": 4.4296875, + "logps/chosen": -901.0, + "logps/rejected": -762.0, + "loss": 0.5033, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.26171875, + "rewards/margins": 6.13671875, + "rewards/rejected": -3.8828125, + "step": 1650 + }, + { + "epoch": 0.3275956148618483, + "grad_norm": 28.582368310919488, + "learning_rate": 8.655425192198789e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.09765625, + "logps/chosen": -1026.0, + "logps/rejected": -608.5, + "loss": 0.3728, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.03125, + "rewards/margins": 6.609375, + "rewards/rejected": -4.5703125, + "step": 1651 + }, + { + "epoch": 0.32779403740264895, + "grad_norm": 35.28166597126321, + "learning_rate": 8.65320238484311e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.09765625, + "logps/chosen": -1255.0, + "logps/rejected": -1647.0, + "loss": 0.3595, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.8544921875, + "rewards/margins": 9.015625, + "rewards/rejected": -7.1484375, + "step": 1652 + }, + { + "epoch": 0.32799245994344955, + "grad_norm": 41.06332081356325, + "learning_rate": 8.650978064952258e-07, + "logits/chosen": 4.22265625, + "logits/rejected": 4.4375, + "logps/chosen": -1136.0, + "logps/rejected": -764.0, + "loss": 0.3986, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.93359375, + "rewards/margins": 5.5859375, + "rewards/rejected": -3.66015625, + "step": 1653 + }, + { + "epoch": 0.3281908824842502, + "grad_norm": 38.701416519809534, + 
"learning_rate": 8.648752233593199e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.4375, + "logps/chosen": -933.0, + "logps/rejected": -628.0, + "loss": 0.4865, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.857421875, + "rewards/margins": 5.59375, + "rewards/rejected": -3.73828125, + "step": 1654 + }, + { + "epoch": 0.32838930502505087, + "grad_norm": 32.38075377500863, + "learning_rate": 8.646524891833626e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 3.9765625, + "logps/chosen": -622.0, + "logps/rejected": -369.0, + "loss": 0.4788, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.3125, + "rewards/margins": 4.8125, + "rewards/rejected": -3.5, + "step": 1655 + }, + { + "epoch": 0.3285877275658515, + "grad_norm": 35.33765447836774, + "learning_rate": 8.644296040741955e-07, + "logits/chosen": 4.3515625, + "logits/rejected": 4.390625, + "logps/chosen": -844.0, + "logps/rejected": -575.0, + "loss": 0.4733, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.08984375, + "rewards/margins": 5.40625, + "rewards/rejected": -3.3203125, + "step": 1656 + }, + { + "epoch": 0.32878615010665213, + "grad_norm": 34.73466234707175, + "learning_rate": 8.642065681387327e-07, + "logits/chosen": 3.62890625, + "logits/rejected": 4.05078125, + "logps/chosen": -897.5, + "logps/rejected": -653.0, + "loss": 0.4784, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.951171875, + "rewards/margins": 5.66796875, + "rewards/rejected": -3.734375, + "step": 1657 + }, + { + "epoch": 0.32898457264745273, + "grad_norm": 25.83311369125833, + "learning_rate": 8.639833814839604e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.9375, + "logps/chosen": -882.0, + "logps/rejected": -847.0, + "loss": 0.4126, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.314453125, + "rewards/margins": 7.7578125, + "rewards/rejected": -5.4375, + "step": 1658 + }, + { + "epoch": 0.3291829951882534, + "grad_norm": 36.62623273942582, + "learning_rate": 8.637600442169374e-07, + 
"logits/chosen": 4.10546875, + "logits/rejected": 4.1015625, + "logps/chosen": -798.0, + "logps/rejected": -1038.5, + "loss": 0.5103, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.484375, + "rewards/margins": 6.21875, + "rewards/rejected": -4.734375, + "step": 1659 + }, + { + "epoch": 0.329381417729054, + "grad_norm": 31.96509992244807, + "learning_rate": 8.635365564447947e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.19921875, + "logps/chosen": -1021.0, + "logps/rejected": -678.0, + "loss": 0.4176, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.99609375, + "rewards/margins": 6.8125, + "rewards/rejected": -3.8203125, + "step": 1660 + }, + { + "epoch": 0.32957984026985465, + "grad_norm": 37.3962197928056, + "learning_rate": 8.633129182747352e-07, + "logits/chosen": 4.515625, + "logits/rejected": 4.55078125, + "logps/chosen": -1027.0, + "logps/rejected": -745.5, + "loss": 0.479, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.90673828125, + "rewards/margins": 6.4140625, + "rewards/rejected": -4.5, + "step": 1661 + }, + { + "epoch": 0.3297782628106553, + "grad_norm": 37.748392399463235, + "learning_rate": 8.630891298140343e-07, + "logits/chosen": 3.9375, + "logits/rejected": 3.99609375, + "logps/chosen": -1216.0, + "logps/rejected": -881.0, + "loss": 0.3574, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5546875, + "rewards/margins": 6.625, + "rewards/rejected": -4.078125, + "step": 1662 + }, + { + "epoch": 0.3299766853514559, + "grad_norm": 30.667200941179257, + "learning_rate": 8.628651911700394e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.3125, + "logps/chosen": -1143.0, + "logps/rejected": -1390.0, + "loss": 0.3381, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4921875, + "rewards/margins": 9.1875, + "rewards/rejected": -6.6953125, + "step": 1663 + }, + { + "epoch": 0.33017510789225657, + "grad_norm": 45.73562014565805, + "learning_rate": 8.626411024501698e-07, + "logits/chosen": 4.01171875, + 
"logits/rejected": 4.03125, + "logps/chosen": -1234.0, + "logps/rejected": -758.0, + "loss": 0.2999, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.69140625, + "rewards/margins": 7.0078125, + "rewards/rejected": -4.31640625, + "step": 1664 + }, + { + "epoch": 0.33037353043305717, + "grad_norm": 28.126415253079497, + "learning_rate": 8.624168637619167e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.84765625, + "logps/chosen": -1197.0, + "logps/rejected": -740.0, + "loss": 0.3302, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.208984375, + "rewards/margins": 7.3828125, + "rewards/rejected": -4.17578125, + "step": 1665 + }, + { + "epoch": 0.33057195297385783, + "grad_norm": 36.45451480347601, + "learning_rate": 8.621924752128437e-07, + "logits/chosen": 3.69140625, + "logits/rejected": 3.65625, + "logps/chosen": -822.0, + "logps/rejected": -1146.0, + "loss": 0.3424, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.001953125, + "rewards/margins": 6.3671875, + "rewards/rejected": -4.359375, + "step": 1666 + }, + { + "epoch": 0.3307703755146585, + "grad_norm": 39.76654112868164, + "learning_rate": 8.619679369105858e-07, + "logits/chosen": 4.48828125, + "logits/rejected": 4.578125, + "logps/chosen": -660.5, + "logps/rejected": -499.5, + "loss": 0.3371, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.828125, + "rewards/margins": 5.328125, + "rewards/rejected": -3.5, + "step": 1667 + }, + { + "epoch": 0.3309687980554591, + "grad_norm": 28.826389497789325, + "learning_rate": 8.617432489628503e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 3.87109375, + "logps/chosen": -949.0, + "logps/rejected": -581.0, + "loss": 0.3391, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.15234375, + "rewards/margins": 6.328125, + "rewards/rejected": -4.18359375, + "step": 1668 + }, + { + "epoch": 0.33116722059625975, + "grad_norm": 36.89471108673998, + "learning_rate": 8.615184114774157e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.2734375, 
+ "logps/chosen": -1142.0, + "logps/rejected": -1090.5, + "loss": 0.4508, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.25390625, + "rewards/margins": 7.296875, + "rewards/rejected": -4.0390625, + "step": 1669 + }, + { + "epoch": 0.33136564313706035, + "grad_norm": 35.65065072870952, + "learning_rate": 8.612934245621329e-07, + "logits/chosen": 4.5625, + "logits/rejected": 4.609375, + "logps/chosen": -877.0, + "logps/rejected": -837.5, + "loss": 0.4425, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.146484375, + "rewards/margins": 17.078125, + "rewards/rejected": -14.869140625, + "step": 1670 + }, + { + "epoch": 0.331564065677861, + "grad_norm": 31.07914056674625, + "learning_rate": 8.610682883249238e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.3828125, + "logps/chosen": -881.5, + "logps/rejected": -738.5, + "loss": 0.3454, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.1806640625, + "rewards/margins": 6.171875, + "rewards/rejected": -3.984375, + "step": 1671 + }, + { + "epoch": 0.33176248821866167, + "grad_norm": 33.2733520557912, + "learning_rate": 8.608430028737825e-07, + "logits/chosen": 3.734375, + "logits/rejected": 3.96484375, + "logps/chosen": -1158.0, + "logps/rejected": -779.5, + "loss": 0.2905, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9453125, + "rewards/margins": 7.7734375, + "rewards/rejected": -4.82421875, + "step": 1672 + }, + { + "epoch": 0.33196091075946227, + "grad_norm": 29.864491475302216, + "learning_rate": 8.606175683167741e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.4140625, + "logps/chosen": -1078.0, + "logps/rejected": -770.0, + "loss": 0.4064, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.818359375, + "rewards/margins": 7.5078125, + "rewards/rejected": -4.69140625, + "step": 1673 + }, + { + "epoch": 0.3321593333002629, + "grad_norm": 32.76885459923962, + "learning_rate": 8.603919847620359e-07, + "logits/chosen": 3.5, + "logits/rejected": 3.53125, + "logps/chosen": 
-1075.0, + "logps/rejected": -685.0, + "loss": 0.418, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.19921875, + "rewards/margins": 6.23828125, + "rewards/rejected": -4.02734375, + "step": 1674 + }, + { + "epoch": 0.33235775584106353, + "grad_norm": 34.730080027675406, + "learning_rate": 8.601662523177761e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.078125, + "logps/chosen": -741.0, + "logps/rejected": -505.0, + "loss": 0.4598, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.49609375, + "rewards/margins": 4.8828125, + "rewards/rejected": -3.390625, + "step": 1675 + }, + { + "epoch": 0.3325561783818642, + "grad_norm": 34.20323185281203, + "learning_rate": 8.599403710922748e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 3.97265625, + "logps/chosen": -724.5, + "logps/rejected": -597.0, + "loss": 0.5342, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.86328125, + "rewards/margins": 4.015625, + "rewards/rejected": -2.15625, + "step": 1676 + }, + { + "epoch": 0.3327546009226648, + "grad_norm": 27.363728547500397, + "learning_rate": 8.59714341193883e-07, + "logits/chosen": 4.6328125, + "logits/rejected": 4.5078125, + "logps/chosen": -896.0, + "logps/rejected": -797.0, + "loss": 0.3473, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.515625, + "rewards/margins": 7.34375, + "rewards/rejected": -4.8203125, + "step": 1677 + }, + { + "epoch": 0.33295302346346545, + "grad_norm": 33.04761841785724, + "learning_rate": 8.594881627310232e-07, + "logits/chosen": 4.32421875, + "logits/rejected": 4.2734375, + "logps/chosen": -801.0, + "logps/rejected": -514.0, + "loss": 0.5682, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.3857421875, + "rewards/margins": 4.12890625, + "rewards/rejected": -2.7421875, + "step": 1678 + }, + { + "epoch": 0.3331514460042661, + "grad_norm": 37.3453655189831, + "learning_rate": 8.592618358121891e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 3.91796875, + "logps/chosen": -1049.0, + 
"logps/rejected": -775.0, + "loss": 0.429, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0703125, + "rewards/margins": 7.15625, + "rewards/rejected": -5.08984375, + "step": 1679 + }, + { + "epoch": 0.3333498685450667, + "grad_norm": 27.938747822629075, + "learning_rate": 8.590353605459462e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.4140625, + "logps/chosen": -888.0, + "logps/rejected": -851.0, + "loss": 0.5531, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.869140625, + "rewards/margins": 5.66796875, + "rewards/rejected": -3.787109375, + "step": 1680 + }, + { + "epoch": 0.33354829108586737, + "grad_norm": 36.807422750077265, + "learning_rate": 8.588087370409302e-07, + "logits/chosen": 3.66015625, + "logits/rejected": 3.57421875, + "logps/chosen": -851.0, + "logps/rejected": -602.5, + "loss": 0.5079, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.21484375, + "rewards/margins": 5.234375, + "rewards/rejected": -3.0302734375, + "step": 1681 + }, + { + "epoch": 0.33374671362666797, + "grad_norm": 46.251780919042204, + "learning_rate": 8.585819654058484e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.2890625, + "logps/chosen": -885.0, + "logps/rejected": -670.5, + "loss": 0.3915, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.75, + "rewards/margins": 6.1171875, + "rewards/rejected": -3.359375, + "step": 1682 + }, + { + "epoch": 0.3339451361674686, + "grad_norm": 43.36810900646483, + "learning_rate": 8.583550457494791e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.01171875, + "logps/chosen": -941.0, + "logps/rejected": -934.0, + "loss": 0.499, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5927734375, + "rewards/margins": 6.1171875, + "rewards/rejected": -4.5078125, + "step": 1683 + }, + { + "epoch": 0.3341435587082693, + "grad_norm": 39.287292525528066, + "learning_rate": 8.581279781806721e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 3.87890625, + "logps/chosen": -1217.0, + 
"logps/rejected": -921.0, + "loss": 0.5153, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.529296875, + "rewards/margins": 7.48046875, + "rewards/rejected": -5.9453125, + "step": 1684 + }, + { + "epoch": 0.3343419812490699, + "grad_norm": 33.231584453117726, + "learning_rate": 8.579007628083471e-07, + "logits/chosen": 4.22265625, + "logits/rejected": 4.3828125, + "logps/chosen": -722.0, + "logps/rejected": -650.0, + "loss": 0.4868, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.90625, + "rewards/margins": 5.8984375, + "rewards/rejected": -4.0, + "step": 1685 + }, + { + "epoch": 0.33454040378987054, + "grad_norm": 35.698321307777576, + "learning_rate": 8.576733997414955e-07, + "logits/chosen": 4.32421875, + "logits/rejected": 4.33203125, + "logps/chosen": -670.5, + "logps/rejected": -731.0, + "loss": 0.5199, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.9169921875, + "rewards/margins": 5.26171875, + "rewards/rejected": -3.3330078125, + "step": 1686 + }, + { + "epoch": 0.33473882633067115, + "grad_norm": 31.34904690611201, + "learning_rate": 8.574458890891794e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.078125, + "logps/chosen": -1038.5, + "logps/rejected": -539.5, + "loss": 0.4975, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0625, + "rewards/margins": 3.810546875, + "rewards/rejected": -2.74609375, + "step": 1687 + }, + { + "epoch": 0.3349372488714718, + "grad_norm": 38.62097188058822, + "learning_rate": 8.572182309605316e-07, + "logits/chosen": 3.953125, + "logits/rejected": 3.83203125, + "logps/chosen": -1101.0, + "logps/rejected": -659.0, + "loss": 0.3163, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.365234375, + "rewards/margins": 7.375, + "rewards/rejected": -4.99609375, + "step": 1688 + }, + { + "epoch": 0.3351356714122724, + "grad_norm": 26.882715639613547, + "learning_rate": 8.569904254647555e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.36328125, + "logps/chosen": -1081.0, + 
"logps/rejected": -1153.0, + "loss": 0.3712, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.51171875, + "rewards/margins": 8.09375, + "rewards/rejected": -5.5859375, + "step": 1689 + }, + { + "epoch": 0.33533409395307306, + "grad_norm": 38.339256568873274, + "learning_rate": 8.567624727111256e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 3.96484375, + "logps/chosen": -1300.0, + "logps/rejected": -998.0, + "loss": 0.3366, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9609375, + "rewards/margins": 9.6484375, + "rewards/rejected": -6.6953125, + "step": 1690 + }, + { + "epoch": 0.3355325164938737, + "grad_norm": 36.61649180315803, + "learning_rate": 8.565343728089866e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.19921875, + "logps/chosen": -881.0, + "logps/rejected": -761.5, + "loss": 0.4903, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.6640625, + "rewards/margins": 7.0078125, + "rewards/rejected": -5.33203125, + "step": 1691 + }, + { + "epoch": 0.3357309390346743, + "grad_norm": 41.95158780527387, + "learning_rate": 8.563061258677539e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.12890625, + "logps/chosen": -791.0, + "logps/rejected": -624.5, + "loss": 0.5861, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.44970703125, + "rewards/margins": 4.22265625, + "rewards/rejected": -2.765625, + "step": 1692 + }, + { + "epoch": 0.335929361575475, + "grad_norm": 35.28548962895737, + "learning_rate": 8.560777319969136e-07, + "logits/chosen": 3.6171875, + "logits/rejected": 3.6875, + "logps/chosen": -1049.0, + "logps/rejected": -1197.5, + "loss": 0.4658, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.548828125, + "rewards/margins": 7.84375, + "rewards/rejected": -5.296875, + "step": 1693 + }, + { + "epoch": 0.3361277841162756, + "grad_norm": 36.769322789603684, + "learning_rate": 8.558491913060225e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 3.53125, + "logps/chosen": -940.0, + 
"logps/rejected": -659.5, + "loss": 0.5284, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.694091796875, + "rewards/margins": 5.53125, + "rewards/rejected": -3.82421875, + "step": 1694 + }, + { + "epoch": 0.33632620665707624, + "grad_norm": 34.5537739057499, + "learning_rate": 8.556205039047068e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.30859375, + "logps/chosen": -790.5, + "logps/rejected": -669.5, + "loss": 0.5004, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.70703125, + "rewards/margins": 6.0625, + "rewards/rejected": -4.361328125, + "step": 1695 + }, + { + "epoch": 0.3365246291978769, + "grad_norm": 25.419689196846104, + "learning_rate": 8.553916699026645e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.359375, + "logps/chosen": -1097.5, + "logps/rejected": -814.25, + "loss": 0.3019, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.34765625, + "rewards/margins": 8.59375, + "rewards/rejected": -6.25, + "step": 1696 + }, + { + "epoch": 0.3367230517386775, + "grad_norm": 34.142008491024974, + "learning_rate": 8.551626894096629e-07, + "logits/chosen": 4.35546875, + "logits/rejected": 4.3125, + "logps/chosen": -802.0, + "logps/rejected": -682.5, + "loss": 0.3662, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6201171875, + "rewards/margins": 7.0546875, + "rewards/rejected": -5.42578125, + "step": 1697 + }, + { + "epoch": 0.33692147427947816, + "grad_norm": 42.4969780062318, + "learning_rate": 8.549335625355398e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.41015625, + "logps/chosen": -1027.0, + "logps/rejected": -684.0, + "loss": 0.3637, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.18359375, + "rewards/margins": 5.3671875, + "rewards/rejected": -3.18359375, + "step": 1698 + }, + { + "epoch": 0.33711989682027876, + "grad_norm": 30.939759839964342, + "learning_rate": 8.547042893902033e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.390625, + "logps/chosen": -925.0, + "logps/rejected": -558.5, + 
"loss": 0.3964, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.2578125, + "rewards/margins": 5.48828125, + "rewards/rejected": -3.2265625, + "step": 1699 + }, + { + "epoch": 0.3373183193610794, + "grad_norm": 35.77579882025158, + "learning_rate": 8.544748700836317e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.4453125, + "logps/chosen": -759.5, + "logps/rejected": -573.0, + "loss": 0.5291, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.853515625, + "rewards/margins": 5.3359375, + "rewards/rejected": -3.4833984375, + "step": 1700 + }, + { + "epoch": 0.3375167419018801, + "grad_norm": 31.576924427178824, + "learning_rate": 8.542453047258735e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.40625, + "logps/chosen": -1306.0, + "logps/rejected": -905.0, + "loss": 0.4585, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4677734375, + "rewards/margins": 7.01171875, + "rewards/rejected": -5.54296875, + "step": 1701 + }, + { + "epoch": 0.3377151644426807, + "grad_norm": 31.85277652771288, + "learning_rate": 8.540155934270471e-07, + "logits/chosen": 3.921875, + "logits/rejected": 4.05859375, + "logps/chosen": -1209.0, + "logps/rejected": -1141.0, + "loss": 0.4012, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.166015625, + "rewards/margins": 10.328125, + "rewards/rejected": -8.1484375, + "step": 1702 + }, + { + "epoch": 0.33791358698348134, + "grad_norm": 41.116121771154866, + "learning_rate": 8.537857362973408e-07, + "logits/chosen": 3.6796875, + "logits/rejected": 4.015625, + "logps/chosen": -885.5, + "logps/rejected": -1270.5, + "loss": 0.4781, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.796875, + "rewards/margins": 7.1171875, + "rewards/rejected": -5.32421875, + "step": 1703 + }, + { + "epoch": 0.33811200952428194, + "grad_norm": 32.91737771550914, + "learning_rate": 8.53555733447013e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 3.80859375, + "logps/chosen": -1124.0, + "logps/rejected": -628.5, + "loss": 
0.4315, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.0556640625, + "rewards/margins": 7.046875, + "rewards/rejected": -4.984375, + "step": 1704 + }, + { + "epoch": 0.3383104320650826, + "grad_norm": 44.397371942615464, + "learning_rate": 8.533255849863919e-07, + "logits/chosen": 3.640625, + "logits/rejected": 3.87109375, + "logps/chosen": -666.0, + "logps/rejected": -636.0, + "loss": 0.4959, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.908203125, + "rewards/margins": 5.6328125, + "rewards/rejected": -3.71484375, + "step": 1705 + }, + { + "epoch": 0.3385088546058832, + "grad_norm": 37.14060617979312, + "learning_rate": 8.530952910258757e-07, + "logits/chosen": 4.296875, + "logits/rejected": 3.9765625, + "logps/chosen": -1373.0, + "logps/rejected": -822.0, + "loss": 0.3911, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.833984375, + "rewards/margins": 7.3828125, + "rewards/rejected": -5.5546875, + "step": 1706 + }, + { + "epoch": 0.33870727714668386, + "grad_norm": 30.591380009055943, + "learning_rate": 8.528648516759326e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.08203125, + "logps/chosen": -1107.5, + "logps/rejected": -647.0, + "loss": 0.4114, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.91796875, + "rewards/margins": 7.30859375, + "rewards/rejected": -4.390625, + "step": 1707 + }, + { + "epoch": 0.3389056996874845, + "grad_norm": 30.754359115617863, + "learning_rate": 8.526342670470998e-07, + "logits/chosen": 3.6953125, + "logits/rejected": 3.88671875, + "logps/chosen": -908.0, + "logps/rejected": -743.5, + "loss": 0.4408, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.083984375, + "rewards/margins": 7.69921875, + "rewards/rejected": -5.61328125, + "step": 1708 + }, + { + "epoch": 0.3391041222282851, + "grad_norm": 26.65054035809008, + "learning_rate": 8.52403537249985e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 4.26953125, + "logps/chosen": -974.0, + "logps/rejected": -730.5, + "loss": 0.5097, 
+ "rewards/accuracies": 0.84375, + "rewards/chosen": 1.671875, + "rewards/margins": 5.734375, + "rewards/rejected": -4.05859375, + "step": 1709 + }, + { + "epoch": 0.3393025447690858, + "grad_norm": 33.198753507585764, + "learning_rate": 8.521726623952651e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.1953125, + "logps/chosen": -1123.0, + "logps/rejected": -771.5, + "loss": 0.4863, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.22265625, + "rewards/margins": 6.1328125, + "rewards/rejected": -3.91015625, + "step": 1710 + }, + { + "epoch": 0.3395009673098864, + "grad_norm": 36.19847900960038, + "learning_rate": 8.519416425936865e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.30859375, + "logps/chosen": -740.0, + "logps/rejected": -675.0, + "loss": 0.6592, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.1910400390625, + "rewards/margins": 4.298828125, + "rewards/rejected": -3.099609375, + "step": 1711 + }, + { + "epoch": 0.33969938985068704, + "grad_norm": 33.945070274875945, + "learning_rate": 8.517104779560655e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 3.89453125, + "logps/chosen": -976.0, + "logps/rejected": -631.0, + "loss": 0.4867, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.9111328125, + "rewards/margins": 5.60546875, + "rewards/rejected": -3.703125, + "step": 1712 + }, + { + "epoch": 0.3398978123914877, + "grad_norm": 27.586563520544022, + "learning_rate": 8.514791685932877e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.78515625, + "logps/chosen": -1171.0, + "logps/rejected": -678.0, + "loss": 0.3459, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.998046875, + "rewards/margins": 6.8828125, + "rewards/rejected": -3.88671875, + "step": 1713 + }, + { + "epoch": 0.3400962349322883, + "grad_norm": 34.86317467408572, + "learning_rate": 8.512477146163078e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.18359375, + "logps/chosen": -1071.0, + "logps/rejected": -688.5, + "loss": 0.4368, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 2.171875, + "rewards/margins": 6.0, + "rewards/rejected": -3.83203125, + "step": 1714 + }, + { + "epoch": 0.34029465747308896, + "grad_norm": 29.63369113131503, + "learning_rate": 8.510161161361502e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.734375, + "logps/chosen": -1056.0, + "logps/rejected": -937.0, + "loss": 0.4194, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.57421875, + "rewards/margins": 7.6796875, + "rewards/rejected": -5.09375, + "step": 1715 + }, + { + "epoch": 0.34049308001388956, + "grad_norm": 34.98153212052855, + "learning_rate": 8.50784373263909e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.296875, + "logps/chosen": -1179.0, + "logps/rejected": -696.5, + "loss": 0.3078, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.669921875, + "rewards/margins": 7.3984375, + "rewards/rejected": -4.7265625, + "step": 1716 + }, + { + "epoch": 0.3406915025546902, + "grad_norm": 34.86011452237173, + "learning_rate": 8.505524861107468e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 3.9453125, + "logps/chosen": -799.5, + "logps/rejected": -542.0, + "loss": 0.4705, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.25390625, + "rewards/margins": 5.0234375, + "rewards/rejected": -2.767578125, + "step": 1717 + }, + { + "epoch": 0.3408899250954909, + "grad_norm": 35.52405127212968, + "learning_rate": 8.503204547878959e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.05078125, + "logps/chosen": -1026.0, + "logps/rejected": -712.5, + "loss": 0.4814, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.73828125, + "rewards/margins": 6.5546875, + "rewards/rejected": -4.82421875, + "step": 1718 + }, + { + "epoch": 0.3410883476362915, + "grad_norm": 27.873628002355165, + "learning_rate": 8.500882794066573e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.83984375, + "logps/chosen": -1035.0, + "logps/rejected": -605.0, + "loss": 0.2956, + 
"rewards/accuracies": 0.90625, + "rewards/chosen": 2.9453125, + "rewards/margins": 7.546875, + "rewards/rejected": -4.58984375, + "step": 1719 + }, + { + "epoch": 0.34128677017709214, + "grad_norm": 28.583960648671844, + "learning_rate": 8.498559600784018e-07, + "logits/chosen": 4.75, + "logits/rejected": 4.9375, + "logps/chosen": -1140.0, + "logps/rejected": -839.0, + "loss": 0.4472, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.537109375, + "rewards/margins": 6.1484375, + "rewards/rejected": -4.609375, + "step": 1720 + }, + { + "epoch": 0.34148519271789274, + "grad_norm": 35.086298154930375, + "learning_rate": 8.496234969145685e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.15234375, + "logps/chosen": -937.0, + "logps/rejected": -779.0, + "loss": 0.5004, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.03125, + "rewards/margins": 7.1015625, + "rewards/rejected": -5.0703125, + "step": 1721 + }, + { + "epoch": 0.3416836152586934, + "grad_norm": 19.39758251567673, + "learning_rate": 8.493908900266662e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.05078125, + "logps/chosen": -1165.0, + "logps/rejected": -739.0, + "loss": 0.3516, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.46484375, + "rewards/margins": 6.9375, + "rewards/rejected": -4.46875, + "step": 1722 + }, + { + "epoch": 0.341882037799494, + "grad_norm": 32.956000442649035, + "learning_rate": 8.491581395262719e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 3.86328125, + "logps/chosen": -743.5, + "logps/rejected": -575.5, + "loss": 0.4222, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.09375, + "rewards/margins": 7.5, + "rewards/rejected": -5.3984375, + "step": 1723 + }, + { + "epoch": 0.34208046034029466, + "grad_norm": 36.18923852862901, + "learning_rate": 8.489252455250323e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.40234375, + "logps/chosen": -814.5, + "logps/rejected": -557.0, + "loss": 0.4562, + "rewards/accuracies": 0.90625, + 
"rewards/chosen": 2.271484375, + "rewards/margins": 5.5, + "rewards/rejected": -3.224609375, + "step": 1724 + }, + { + "epoch": 0.3422788828810953, + "grad_norm": 25.383871287802155, + "learning_rate": 8.486922081346623e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.31640625, + "logps/chosen": -1074.0, + "logps/rejected": -1012.0, + "loss": 0.406, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.55078125, + "rewards/margins": 9.1953125, + "rewards/rejected": -6.6484375, + "step": 1725 + }, + { + "epoch": 0.3424773054218959, + "grad_norm": 32.291669367161276, + "learning_rate": 8.484590274669458e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.95703125, + "logps/chosen": -958.0, + "logps/rejected": -669.0, + "loss": 0.5164, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.951171875, + "rewards/margins": 5.72265625, + "rewards/rejected": -3.7626953125, + "step": 1726 + }, + { + "epoch": 0.3426757279626966, + "grad_norm": 35.40645659029858, + "learning_rate": 8.482257036337354e-07, + "logits/chosen": 4.41015625, + "logits/rejected": 4.44921875, + "logps/chosen": -993.0, + "logps/rejected": -924.0, + "loss": 0.5096, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.10546875, + "rewards/margins": 6.21875, + "rewards/rejected": -4.11328125, + "step": 1727 + }, + { + "epoch": 0.3428741505034972, + "grad_norm": 36.54661774170455, + "learning_rate": 8.479922367469524e-07, + "logits/chosen": 4.703125, + "logits/rejected": 4.6640625, + "logps/chosen": -797.0, + "logps/rejected": -744.0, + "loss": 0.577, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.7060546875, + "rewards/margins": 4.75390625, + "rewards/rejected": -3.0546875, + "step": 1728 + }, + { + "epoch": 0.34307257304429783, + "grad_norm": 34.86784065802358, + "learning_rate": 8.477586269185867e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.44921875, + "logps/chosen": -1004.5, + "logps/rejected": -701.0, + "loss": 0.4864, + "rewards/accuracies": 0.71875, + 
"rewards/chosen": 1.98046875, + "rewards/margins": 5.75, + "rewards/rejected": -3.775390625, + "step": 1729 + }, + { + "epoch": 0.3432709955850985, + "grad_norm": 28.874594715646424, + "learning_rate": 8.475248742606969e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.28515625, + "logps/chosen": -829.0, + "logps/rejected": -564.0, + "loss": 0.6042, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.1181640625, + "rewards/margins": 4.44140625, + "rewards/rejected": -3.3203125, + "step": 1730 + }, + { + "epoch": 0.3434694181258991, + "grad_norm": 39.69593317984874, + "learning_rate": 8.472909788854099e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 4.3359375, + "logps/chosen": -852.5, + "logps/rejected": -1113.0, + "loss": 0.3635, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.970703125, + "rewards/margins": 6.5546875, + "rewards/rejected": -4.57421875, + "step": 1731 + }, + { + "epoch": 0.34366784066669975, + "grad_norm": 31.81468182546872, + "learning_rate": 8.470569409049213e-07, + "logits/chosen": 4.4453125, + "logits/rejected": 4.46875, + "logps/chosen": -851.5, + "logps/rejected": -608.5, + "loss": 0.4725, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.330078125, + "rewards/margins": 5.71484375, + "rewards/rejected": -3.3818359375, + "step": 1732 + }, + { + "epoch": 0.34386626320750036, + "grad_norm": 43.23449122709111, + "learning_rate": 8.468227604314947e-07, + "logits/chosen": 4.48828125, + "logits/rejected": 4.34375, + "logps/chosen": -1021.0, + "logps/rejected": -671.5, + "loss": 0.5537, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.381591796875, + "rewards/margins": 4.48046875, + "rewards/rejected": -3.099609375, + "step": 1733 + }, + { + "epoch": 0.344064685748301, + "grad_norm": 34.22935724217437, + "learning_rate": 8.465884375774625e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.19921875, + "logps/chosen": -892.0, + "logps/rejected": -621.0, + "loss": 0.4938, + "rewards/accuracies": 0.78125, + 
"rewards/chosen": 1.78125, + "rewards/margins": 5.4296875, + "rewards/rejected": -3.6484375, + "step": 1734 + }, + { + "epoch": 0.3442631082891016, + "grad_norm": 40.20261600974287, + "learning_rate": 8.463539724552253e-07, + "logits/chosen": 3.8515625, + "logits/rejected": 3.9375, + "logps/chosen": -967.0, + "logps/rejected": -1161.0, + "loss": 0.4531, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.931640625, + "rewards/margins": 7.44140625, + "rewards/rejected": -5.515625, + "step": 1735 + }, + { + "epoch": 0.3444615308299023, + "grad_norm": 28.430478787679224, + "learning_rate": 8.461193651772517e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.2421875, + "logps/chosen": -1071.0, + "logps/rejected": -897.0, + "loss": 0.3358, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.57421875, + "rewards/margins": 8.53125, + "rewards/rejected": -5.96484375, + "step": 1736 + }, + { + "epoch": 0.34465995337070293, + "grad_norm": 36.65206852565349, + "learning_rate": 8.458846158560786e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 4.04296875, + "logps/chosen": -867.0, + "logps/rejected": -653.0, + "loss": 0.4782, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.6220703125, + "rewards/margins": 5.60546875, + "rewards/rejected": -3.984375, + "step": 1737 + }, + { + "epoch": 0.34485837591150353, + "grad_norm": 34.00066532375666, + "learning_rate": 8.456497246043112e-07, + "logits/chosen": 4.31640625, + "logits/rejected": 4.25390625, + "logps/chosen": -1486.0, + "logps/rejected": -1318.0, + "loss": 0.5082, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.0, + "rewards/margins": 5.24609375, + "rewards/rejected": -5.236328125, + "step": 1738 + }, + { + "epoch": 0.3450567984523042, + "grad_norm": 30.97270574105151, + "learning_rate": 8.454146915346227e-07, + "logits/chosen": 3.859375, + "logits/rejected": 3.8359375, + "logps/chosen": -743.5, + "logps/rejected": -499.0, + "loss": 0.5039, + "rewards/accuracies": 0.78125, + "rewards/chosen": 
2.05859375, + "rewards/margins": 5.05859375, + "rewards/rejected": -2.9918212890625, + "step": 1739 + }, + { + "epoch": 0.3452552209931048, + "grad_norm": 23.721605415794368, + "learning_rate": 8.451795167597539e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.3984375, + "logps/chosen": -799.5, + "logps/rejected": -882.5, + "loss": 0.5249, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.279296875, + "rewards/margins": 6.09765625, + "rewards/rejected": -3.8203125, + "step": 1740 + }, + { + "epoch": 0.34545364353390545, + "grad_norm": 40.098181752566994, + "learning_rate": 8.449442003925145e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 3.83984375, + "logps/chosen": -1022.0, + "logps/rejected": -664.0, + "loss": 0.3828, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.5556640625, + "rewards/margins": 6.5078125, + "rewards/rejected": -4.95703125, + "step": 1741 + }, + { + "epoch": 0.3456520660747061, + "grad_norm": 39.74411376984556, + "learning_rate": 8.447087425457814e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.2890625, + "logps/chosen": -974.0, + "logps/rejected": -1792.0, + "loss": 0.4642, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.265625, + "rewards/margins": 8.5390625, + "rewards/rejected": -6.296875, + "step": 1742 + }, + { + "epoch": 0.3458504886155067, + "grad_norm": 40.24805000078307, + "learning_rate": 8.444731433324993e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 3.953125, + "logps/chosen": -892.0, + "logps/rejected": -574.0, + "loss": 0.5127, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.142578125, + "rewards/margins": 5.0546875, + "rewards/rejected": -2.91796875, + "step": 1743 + }, + { + "epoch": 0.34604891115630737, + "grad_norm": 45.81620081756685, + "learning_rate": 8.442374028656814e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.84765625, + "logps/chosen": -1172.0, + "logps/rejected": -847.0, + "loss": 0.472, + "rewards/accuracies": 0.84375, + "rewards/chosen": 
1.071533203125, + "rewards/margins": 7.3515625, + "rewards/rejected": -6.265625, + "step": 1744 + }, + { + "epoch": 0.346247333697108, + "grad_norm": 33.14974571958698, + "learning_rate": 8.440015212584078e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 3.9140625, + "logps/chosen": -762.0, + "logps/rejected": -642.0, + "loss": 0.336, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.6142578125, + "rewards/margins": 5.9140625, + "rewards/rejected": -4.30078125, + "step": 1745 + }, + { + "epoch": 0.34644575623790863, + "grad_norm": 34.45178973455706, + "learning_rate": 8.437654986238268e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 4.01953125, + "logps/chosen": -1081.0, + "logps/rejected": -1074.0, + "loss": 0.4486, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.43359375, + "rewards/margins": 8.1015625, + "rewards/rejected": -5.6875, + "step": 1746 + }, + { + "epoch": 0.3466441787787093, + "grad_norm": 31.64675921911148, + "learning_rate": 8.435293350751544e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.09765625, + "logps/chosen": -988.0, + "logps/rejected": -678.5, + "loss": 0.4135, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.431640625, + "rewards/margins": 6.078125, + "rewards/rejected": -3.65234375, + "step": 1747 + }, + { + "epoch": 0.3468426013195099, + "grad_norm": 34.43959170125551, + "learning_rate": 8.432930307256742e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 3.703125, + "logps/chosen": -819.0, + "logps/rejected": -639.0, + "loss": 0.4793, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.298828125, + "rewards/margins": 5.20703125, + "rewards/rejected": -2.91015625, + "step": 1748 + }, + { + "epoch": 0.34704102386031055, + "grad_norm": 31.26248979453601, + "learning_rate": 8.430565856887368e-07, + "logits/chosen": 3.69140625, + "logits/rejected": 3.89453125, + "logps/chosen": -882.0, + "logps/rejected": -672.0, + "loss": 0.3477, + "rewards/accuracies": 0.9375, + "rewards/chosen": 
2.080078125, + "rewards/margins": 6.6328125, + "rewards/rejected": -4.556640625, + "step": 1749 + }, + { + "epoch": 0.34723944640111115, + "grad_norm": 29.98369385573268, + "learning_rate": 8.42820000077761e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.07421875, + "logps/chosen": -1017.0, + "logps/rejected": -775.5, + "loss": 0.4528, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6484375, + "rewards/margins": 6.0, + "rewards/rejected": -4.35546875, + "step": 1750 + }, + { + "epoch": 0.3474378689419118, + "grad_norm": 34.61113786610468, + "learning_rate": 8.425832740062326e-07, + "logits/chosen": 4.203125, + "logits/rejected": 3.97265625, + "logps/chosen": -1321.0, + "logps/rejected": -840.0, + "loss": 0.4059, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.759765625, + "rewards/margins": 8.34375, + "rewards/rejected": -5.5859375, + "step": 1751 + }, + { + "epoch": 0.3476362914827124, + "grad_norm": 30.37223361243773, + "learning_rate": 8.423464075877048e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.15625, + "logps/chosen": -1275.0, + "logps/rejected": -834.0, + "loss": 0.4126, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.43896484375, + "rewards/margins": 6.546875, + "rewards/rejected": -4.10546875, + "step": 1752 + }, + { + "epoch": 0.34783471402351307, + "grad_norm": 43.091018015380456, + "learning_rate": 8.421094009357983e-07, + "logits/chosen": 3.34765625, + "logits/rejected": 3.42578125, + "logps/chosen": -857.0, + "logps/rejected": -1066.5, + "loss": 0.5226, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8212890625, + "rewards/margins": 5.90625, + "rewards/rejected": -4.08984375, + "step": 1753 + }, + { + "epoch": 0.3480331365643137, + "grad_norm": 29.970229085163254, + "learning_rate": 8.41872254164201e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.85546875, + "logps/chosen": -1180.0, + "logps/rejected": -1370.0, + "loss": 0.3582, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.798828125, 
+ "rewards/margins": 14.484375, + "rewards/rejected": -11.712890625, + "step": 1754 + }, + { + "epoch": 0.34823155910511433, + "grad_norm": 30.471247348553685, + "learning_rate": 8.41634967386668e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.05859375, + "logps/chosen": -645.0, + "logps/rejected": -645.5, + "loss": 0.4469, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.27734375, + "rewards/margins": 6.01171875, + "rewards/rejected": -3.744140625, + "step": 1755 + }, + { + "epoch": 0.348429981645915, + "grad_norm": 32.808599552239095, + "learning_rate": 8.413975407170216e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.2421875, + "logps/chosen": -1221.0, + "logps/rejected": -1164.0, + "loss": 0.3963, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.267578125, + "rewards/margins": 9.77734375, + "rewards/rejected": -6.51953125, + "step": 1756 + }, + { + "epoch": 0.3486284041867156, + "grad_norm": 28.74460016410754, + "learning_rate": 8.411599742691511e-07, + "logits/chosen": 4.3984375, + "logits/rejected": 4.44921875, + "logps/chosen": -942.0, + "logps/rejected": -1682.0, + "loss": 0.4164, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.42578125, + "rewards/margins": 9.484375, + "rewards/rejected": -7.0546875, + "step": 1757 + }, + { + "epoch": 0.34882682672751625, + "grad_norm": 29.600236916963823, + "learning_rate": 8.409222681570129e-07, + "logits/chosen": 4.38671875, + "logits/rejected": 4.34375, + "logps/chosen": -771.5, + "logps/rejected": -569.5, + "loss": 0.4684, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.265625, + "rewards/margins": 7.12109375, + "rewards/rejected": -4.86328125, + "step": 1758 + }, + { + "epoch": 0.3490252492683169, + "grad_norm": 33.598003536586084, + "learning_rate": 8.406844224946304e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.23828125, + "logps/chosen": -840.0, + "logps/rejected": -754.0, + "loss": 0.5396, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.9013671875, + 
"rewards/margins": 5.21875, + "rewards/rejected": -3.32421875, + "step": 1759 + }, + { + "epoch": 0.3492236718091175, + "grad_norm": 30.858136458567696, + "learning_rate": 8.404464373960939e-07, + "logits/chosen": 4.015625, + "logits/rejected": 4.05859375, + "logps/chosen": -881.0, + "logps/rejected": -1489.0, + "loss": 0.4262, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2744140625, + "rewards/margins": 7.96875, + "rewards/rejected": -5.69921875, + "step": 1760 + }, + { + "epoch": 0.34942209434991817, + "grad_norm": 34.57793341555886, + "learning_rate": 8.402083129755606e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.28125, + "logps/chosen": -795.0, + "logps/rejected": -603.0, + "loss": 0.3908, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.28125, + "rewards/margins": 6.46875, + "rewards/rejected": -4.1787109375, + "step": 1761 + }, + { + "epoch": 0.34962051689071877, + "grad_norm": 26.054863594143693, + "learning_rate": 8.399700493472548e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.984375, + "logps/chosen": -968.0, + "logps/rejected": -1428.5, + "loss": 0.429, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.05029296875, + "rewards/margins": 9.35546875, + "rewards/rejected": -7.31640625, + "step": 1762 + }, + { + "epoch": 0.3498189394315194, + "grad_norm": 29.633772055207643, + "learning_rate": 8.397316466254669e-07, + "logits/chosen": 4.6953125, + "logits/rejected": 4.59375, + "logps/chosen": -971.0, + "logps/rejected": -740.0, + "loss": 0.3897, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.36474609375, + "rewards/margins": 6.2734375, + "rewards/rejected": -4.9140625, + "step": 1763 + }, + { + "epoch": 0.35001736197232003, + "grad_norm": 30.812764808111996, + "learning_rate": 8.394931049245548e-07, + "logits/chosen": 3.70703125, + "logits/rejected": 3.91015625, + "logps/chosen": -846.0, + "logps/rejected": -556.0, + "loss": 0.4091, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.013671875, + 
"rewards/margins": 6.6640625, + "rewards/rejected": -4.6484375, + "step": 1764 + }, + { + "epoch": 0.3502157845131207, + "grad_norm": 26.28012660989084, + "learning_rate": 8.392544243589427e-07, + "logits/chosen": 3.5078125, + "logits/rejected": 3.6875, + "logps/chosen": -1204.0, + "logps/rejected": -826.0, + "loss": 0.3125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7578125, + "rewards/margins": 8.9140625, + "rewards/rejected": -6.1640625, + "step": 1765 + }, + { + "epoch": 0.35041420705392134, + "grad_norm": 37.154975837864534, + "learning_rate": 8.390156050431213e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 3.90625, + "logps/chosen": -818.0, + "logps/rejected": -824.0, + "loss": 0.4482, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.728515625, + "rewards/margins": 8.1796875, + "rewards/rejected": -6.4609375, + "step": 1766 + }, + { + "epoch": 0.35061262959472195, + "grad_norm": 40.05266525356499, + "learning_rate": 8.38776647091648e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 4.04296875, + "logps/chosen": -1369.0, + "logps/rejected": -863.0, + "loss": 0.3183, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.732421875, + "rewards/margins": 9.6640625, + "rewards/rejected": -6.9296875, + "step": 1767 + }, + { + "epoch": 0.3508110521355226, + "grad_norm": 39.666919353289295, + "learning_rate": 8.385375506191466e-07, + "logits/chosen": 3.96875, + "logits/rejected": 3.89453125, + "logps/chosen": -1031.0, + "logps/rejected": -674.5, + "loss": 0.5542, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.365234375, + "rewards/margins": 5.0390625, + "rewards/rejected": -3.671875, + "step": 1768 + }, + { + "epoch": 0.3510094746763232, + "grad_norm": 44.54111173434403, + "learning_rate": 8.382983157403077e-07, + "logits/chosen": 4.05078125, + "logits/rejected": 3.82421875, + "logps/chosen": -1096.0, + "logps/rejected": -732.0, + "loss": 0.4851, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.828125, + "rewards/margins": 
7.109375, + "rewards/rejected": -5.28125, + "step": 1769 + }, + { + "epoch": 0.35120789721712387, + "grad_norm": 37.32435182240089, + "learning_rate": 8.380589425698879e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.9609375, + "logps/chosen": -1044.0, + "logps/rejected": -990.5, + "loss": 0.5191, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.53271484375, + "rewards/margins": 6.328125, + "rewards/rejected": -4.798828125, + "step": 1770 + }, + { + "epoch": 0.3514063197579245, + "grad_norm": 32.776117282415775, + "learning_rate": 8.3781943122271e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 4.0078125, + "logps/chosen": -655.0, + "logps/rejected": -524.0, + "loss": 0.4468, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.984375, + "rewards/margins": 6.0546875, + "rewards/rejected": -4.05859375, + "step": 1771 + }, + { + "epoch": 0.3516047422987251, + "grad_norm": 40.354582137929306, + "learning_rate": 8.375797818136636e-07, + "logits/chosen": 3.671875, + "logits/rejected": 3.15625, + "logps/chosen": -835.0, + "logps/rejected": -611.5, + "loss": 0.5784, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.4072265625, + "rewards/margins": 4.91796875, + "rewards/rejected": -3.51171875, + "step": 1772 + }, + { + "epoch": 0.3518031648395258, + "grad_norm": 40.36202759349572, + "learning_rate": 8.373399944577045e-07, + "logits/chosen": 4.015625, + "logits/rejected": 3.98828125, + "logps/chosen": -1520.0, + "logps/rejected": -980.0, + "loss": 0.3713, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.171875, + "rewards/margins": 8.9921875, + "rewards/rejected": -6.828125, + "step": 1773 + }, + { + "epoch": 0.3520015873803264, + "grad_norm": 29.13241016047537, + "learning_rate": 8.371000692698539e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.32421875, + "logps/chosen": -1072.0, + "logps/rejected": -1048.0, + "loss": 0.3285, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.193359375, + "rewards/margins": 9.25, + 
"rewards/rejected": -7.0625, + "step": 1774 + }, + { + "epoch": 0.35220000992112704, + "grad_norm": 40.52189589331609, + "learning_rate": 8.368600063651999e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.1328125, + "logps/chosen": -788.0, + "logps/rejected": -694.5, + "loss": 0.4601, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.263671875, + "rewards/margins": 5.359375, + "rewards/rejected": -4.10546875, + "step": 1775 + }, + { + "epoch": 0.3523984324619277, + "grad_norm": 33.490223339536804, + "learning_rate": 8.366198058588966e-07, + "logits/chosen": 3.921875, + "logits/rejected": 4.14453125, + "logps/chosen": -1114.0, + "logps/rejected": -765.0, + "loss": 0.4133, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.1531982421875, + "rewards/margins": 7.30859375, + "rewards/rejected": -5.15625, + "step": 1776 + }, + { + "epoch": 0.3525968550027283, + "grad_norm": 36.179232661590284, + "learning_rate": 8.363794678661636e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.23046875, + "logps/chosen": -929.0, + "logps/rejected": -951.0, + "loss": 0.497, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.9306640625, + "rewards/margins": 7.15625, + "rewards/rejected": -5.2265625, + "step": 1777 + }, + { + "epoch": 0.35279527754352896, + "grad_norm": 29.468129634582287, + "learning_rate": 8.361389925022868e-07, + "logits/chosen": 4.49609375, + "logits/rejected": 4.5, + "logps/chosen": -1088.0, + "logps/rejected": -618.5, + "loss": 0.4958, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0029296875, + "rewards/margins": 5.75, + "rewards/rejected": -3.75, + "step": 1778 + }, + { + "epoch": 0.35299370008432956, + "grad_norm": 29.446782578655917, + "learning_rate": 8.358983798826182e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.0390625, + "logps/chosen": -1039.0, + "logps/rejected": -902.5, + "loss": 0.4541, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.55859375, + "rewards/margins": 6.5625, + "rewards/rejected": 
-4.98828125, + "step": 1779 + }, + { + "epoch": 0.3531921226251302, + "grad_norm": 28.297396771654324, + "learning_rate": 8.356576301225752e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 4.28125, + "logps/chosen": -952.5, + "logps/rejected": -746.0, + "loss": 0.4982, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.41796875, + "rewards/margins": 6.2890625, + "rewards/rejected": -4.859375, + "step": 1780 + }, + { + "epoch": 0.3533905451659308, + "grad_norm": 40.434261859229096, + "learning_rate": 8.354167433376414e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.9765625, + "logps/chosen": -1191.0, + "logps/rejected": -704.0, + "loss": 0.4215, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.42578125, + "rewards/margins": 6.4375, + "rewards/rejected": -4.01953125, + "step": 1781 + }, + { + "epoch": 0.3535889677067315, + "grad_norm": 38.29601758006851, + "learning_rate": 8.351757196433656e-07, + "logits/chosen": 3.984375, + "logits/rejected": 3.8359375, + "logps/chosen": -1241.0, + "logps/rejected": -779.0, + "loss": 0.3455, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.419921875, + "rewards/margins": 7.5078125, + "rewards/rejected": -5.09375, + "step": 1782 + }, + { + "epoch": 0.35378739024753214, + "grad_norm": 23.67281219479216, + "learning_rate": 8.349345591553629e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.2265625, + "logps/chosen": -1391.0, + "logps/rejected": -874.0, + "loss": 0.302, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8125, + "rewards/margins": 7.2265625, + "rewards/rejected": -4.41796875, + "step": 1783 + }, + { + "epoch": 0.35398581278833274, + "grad_norm": 37.69737068642854, + "learning_rate": 8.346932619893133e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.99609375, + "logps/chosen": -1142.0, + "logps/rejected": -684.0, + "loss": 0.4478, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.05859375, + "rewards/margins": 7.625, + "rewards/rejected": -4.56640625, + "step": 1784 
+ }, + { + "epoch": 0.3541842353291334, + "grad_norm": 44.924718747629335, + "learning_rate": 8.344518282609633e-07, + "logits/chosen": 3.67578125, + "logits/rejected": 3.6875, + "logps/chosen": -1039.0, + "logps/rejected": -823.5, + "loss": 0.4129, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.92578125, + "rewards/margins": 6.6328125, + "rewards/rejected": -4.69921875, + "step": 1785 + }, + { + "epoch": 0.354382657869934, + "grad_norm": 30.914561894856874, + "learning_rate": 8.342102580861239e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.1328125, + "logps/chosen": -871.0, + "logps/rejected": -672.0, + "loss": 0.5036, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6923828125, + "rewards/margins": 4.5546875, + "rewards/rejected": -2.861328125, + "step": 1786 + }, + { + "epoch": 0.35458108041073466, + "grad_norm": 37.792620135805905, + "learning_rate": 8.339685515806721e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.6796875, + "logps/chosen": -579.0, + "logps/rejected": -789.5, + "loss": 0.4992, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.252685546875, + "rewards/margins": 5.796875, + "rewards/rejected": -4.54296875, + "step": 1787 + }, + { + "epoch": 0.3547795029515353, + "grad_norm": 35.16958817951789, + "learning_rate": 8.337267088605505e-07, + "logits/chosen": 4.73046875, + "logits/rejected": 4.5546875, + "logps/chosen": -1780.0, + "logps/rejected": -786.0, + "loss": 0.3558, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.5703125, + "rewards/margins": 6.095703125, + "rewards/rejected": -5.515625, + "step": 1788 + }, + { + "epoch": 0.3549779254923359, + "grad_norm": 27.994557849471448, + "learning_rate": 8.334847300417666e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.7890625, + "logps/chosen": -1034.0, + "logps/rejected": -787.0, + "loss": 0.3118, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.763671875, + "rewards/margins": 8.5546875, + "rewards/rejected": -5.79296875, + "step": 1789 + }, 
+ { + "epoch": 0.3551763480331366, + "grad_norm": 28.753392503858, + "learning_rate": 8.332426152403933e-07, + "logits/chosen": 4.078125, + "logits/rejected": 4.1015625, + "logps/chosen": -1100.0, + "logps/rejected": -1484.0, + "loss": 0.4618, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.5849609375, + "rewards/margins": 8.3359375, + "rewards/rejected": -6.734375, + "step": 1790 + }, + { + "epoch": 0.3553747705739372, + "grad_norm": 40.595125304552255, + "learning_rate": 8.330003645725686e-07, + "logits/chosen": 4.39453125, + "logits/rejected": 4.48046875, + "logps/chosen": -1038.0, + "logps/rejected": -778.0, + "loss": 0.3859, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.140625, + "rewards/margins": 6.953125, + "rewards/rejected": -4.8203125, + "step": 1791 + }, + { + "epoch": 0.35557319311473784, + "grad_norm": 39.41562413684849, + "learning_rate": 8.327579781544962e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.33203125, + "logps/chosen": -976.0, + "logps/rejected": -616.0, + "loss": 0.419, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.1328125, + "rewards/margins": 6.015625, + "rewards/rejected": -3.884765625, + "step": 1792 + }, + { + "epoch": 0.35577161565553844, + "grad_norm": 24.301645036028884, + "learning_rate": 8.325154561024443e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.28125, + "logps/chosen": -1531.5, + "logps/rejected": -1063.5, + "loss": 0.3355, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.998046875, + "rewards/margins": 9.65625, + "rewards/rejected": -6.65234375, + "step": 1793 + }, + { + "epoch": 0.3559700381963391, + "grad_norm": 36.501235928664904, + "learning_rate": 8.322727985327465e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 4.125, + "logps/chosen": -1132.0, + "logps/rejected": -812.0, + "loss": 0.334, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.96484375, + "rewards/margins": 8.4609375, + "rewards/rejected": -5.5078125, + "step": 1794 + }, + { + "epoch": 
0.35616846073713976, + "grad_norm": 59.8520474969439, + "learning_rate": 8.320300055618013e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.234375, + "logps/chosen": -959.0, + "logps/rejected": -1091.5, + "loss": 0.5369, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.541015625, + "rewards/margins": 6.3125, + "rewards/rejected": -3.76953125, + "step": 1795 + }, + { + "epoch": 0.35636688327794036, + "grad_norm": 27.6154710535454, + "learning_rate": 8.317870773060724e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.328125, + "logps/chosen": -1064.0, + "logps/rejected": -609.5, + "loss": 0.4409, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.587890625, + "rewards/margins": 6.5703125, + "rewards/rejected": -4.97265625, + "step": 1796 + }, + { + "epoch": 0.356565305818741, + "grad_norm": 34.410280870506554, + "learning_rate": 8.315440138820878e-07, + "logits/chosen": 3.66015625, + "logits/rejected": 3.66015625, + "logps/chosen": -693.0, + "logps/rejected": -477.5, + "loss": 0.5436, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.822265625, + "rewards/margins": 4.4765625, + "rewards/rejected": -2.654296875, + "step": 1797 + }, + { + "epoch": 0.3567637283595416, + "grad_norm": 29.705629574190482, + "learning_rate": 8.31300815406441e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.1796875, + "logps/chosen": -1057.5, + "logps/rejected": -777.0, + "loss": 0.444, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.83984375, + "rewards/margins": 7.6171875, + "rewards/rejected": -5.76953125, + "step": 1798 + }, + { + "epoch": 0.3569621509003423, + "grad_norm": 45.89305904439121, + "learning_rate": 8.310574819957898e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.42578125, + "logps/chosen": -808.0, + "logps/rejected": -997.5, + "loss": 0.468, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.451171875, + "rewards/margins": 6.625, + "rewards/rejected": -5.16015625, + "step": 1799 + }, + { + "epoch": 
0.35716057344114294, + "grad_norm": 36.80322051368704, + "learning_rate": 8.308140137668571e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.3125, + "logps/chosen": -755.0, + "logps/rejected": -780.5, + "loss": 0.5296, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.2763671875, + "rewards/margins": 5.669921875, + "rewards/rejected": -4.400390625, + "step": 1800 + }, + { + "epoch": 0.35735899598194354, + "grad_norm": 38.89830823366214, + "learning_rate": 8.305704108364301e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.06640625, + "logps/chosen": -825.0, + "logps/rejected": -620.0, + "loss": 0.5028, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.66796875, + "rewards/margins": 7.1328125, + "rewards/rejected": -5.45703125, + "step": 1801 + }, + { + "epoch": 0.3575574185227442, + "grad_norm": 42.78231409686471, + "learning_rate": 8.303266733213607e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 4.1328125, + "logps/chosen": -984.0, + "logps/rejected": -675.0, + "loss": 0.6235, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.11572265625, + "rewards/margins": 5.12890625, + "rewards/rejected": -4.01171875, + "step": 1802 + }, + { + "epoch": 0.3577558410635448, + "grad_norm": 31.754314437634342, + "learning_rate": 8.300828013385658e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 4.1015625, + "logps/chosen": -1044.5, + "logps/rejected": -970.5, + "loss": 0.3792, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3193359375, + "rewards/margins": 8.84375, + "rewards/rejected": -6.53515625, + "step": 1803 + }, + { + "epoch": 0.35795426360434546, + "grad_norm": 42.12292655123144, + "learning_rate": 8.298387950050263e-07, + "logits/chosen": 3.64453125, + "logits/rejected": 3.32421875, + "logps/chosen": -1682.0, + "logps/rejected": -632.5, + "loss": 0.5477, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.2177734375, + "rewards/margins": 3.78759765625, + "rewards/rejected": -4.0078125, + "step": 1804 + }, + { + 
"epoch": 0.3581526861451461, + "grad_norm": 35.27073409204333, + "learning_rate": 8.295946544377877e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.21484375, + "logps/chosen": -1024.0, + "logps/rejected": -727.0, + "loss": 0.5445, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.5390625, + "rewards/margins": 5.24609375, + "rewards/rejected": -3.703125, + "step": 1805 + }, + { + "epoch": 0.3583511086859467, + "grad_norm": 40.61386540747612, + "learning_rate": 8.293503797539599e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 4.3046875, + "logps/chosen": -747.5, + "logps/rejected": -716.0, + "loss": 0.6304, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.7890625, + "rewards/margins": 9.17578125, + "rewards/rejected": -7.392578125, + "step": 1806 + }, + { + "epoch": 0.3585495312267474, + "grad_norm": 39.20334012954252, + "learning_rate": 8.291059710707169e-07, + "logits/chosen": 3.5078125, + "logits/rejected": 3.38671875, + "logps/chosen": -1042.0, + "logps/rejected": -669.5, + "loss": 0.4398, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.998046875, + "rewards/margins": 6.3515625, + "rewards/rejected": -4.3515625, + "step": 1807 + }, + { + "epoch": 0.358747953767548, + "grad_norm": 35.67909559598171, + "learning_rate": 8.288614285052977e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.80859375, + "logps/chosen": -949.0, + "logps/rejected": -722.0, + "loss": 0.4784, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.3193359375, + "rewards/margins": 5.8359375, + "rewards/rejected": -4.52734375, + "step": 1808 + }, + { + "epoch": 0.35894637630834864, + "grad_norm": 31.969390498424616, + "learning_rate": 8.286167521750046e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.2265625, + "logps/chosen": -834.75, + "logps/rejected": -870.0, + "loss": 0.471, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.681640625, + "rewards/margins": 5.85546875, + "rewards/rejected": -3.177734375, + "step": 1809 + }, + { + 
"epoch": 0.35914479884914924, + "grad_norm": 26.99811900178963, + "learning_rate": 8.283719421972047e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.03125, + "logps/chosen": -942.0, + "logps/rejected": -786.0, + "loss": 0.393, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.921875, + "rewards/margins": 7.15625, + "rewards/rejected": -4.23828125, + "step": 1810 + }, + { + "epoch": 0.3593432213899499, + "grad_norm": 29.069214739742257, + "learning_rate": 8.281269986893287e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.2890625, + "logps/chosen": -844.0, + "logps/rejected": -644.0, + "loss": 0.4069, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.375, + "rewards/margins": 6.3828125, + "rewards/rejected": -4.0078125, + "step": 1811 + }, + { + "epoch": 0.35954164393075055, + "grad_norm": 30.92554298139324, + "learning_rate": 8.27881921768872e-07, + "logits/chosen": 3.74609375, + "logits/rejected": 4.30078125, + "logps/chosen": -874.0, + "logps/rejected": -1508.0, + "loss": 0.5334, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.951171875, + "rewards/margins": 13.2421875, + "rewards/rejected": -11.298828125, + "step": 1812 + }, + { + "epoch": 0.35974006647155116, + "grad_norm": 36.08565028741397, + "learning_rate": 8.276367115533936e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.3671875, + "logps/chosen": -1128.0, + "logps/rejected": -824.0, + "loss": 0.5172, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.21875, + "rewards/margins": 5.51171875, + "rewards/rejected": -3.296875, + "step": 1813 + }, + { + "epoch": 0.3599384890123518, + "grad_norm": 29.301261231400765, + "learning_rate": 8.273913681605164e-07, + "logits/chosen": 4.5390625, + "logits/rejected": 4.47265625, + "logps/chosen": -1074.0, + "logps/rejected": -620.0, + "loss": 0.4567, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4599609375, + "rewards/margins": 7.234375, + "rewards/rejected": -4.7734375, + "step": 1814 + }, + { + "epoch": 0.3601369115531524, 
+ "grad_norm": 42.07642272377312, + "learning_rate": 8.271458917079272e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.92578125, + "logps/chosen": -902.0, + "logps/rejected": -671.5, + "loss": 0.3872, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.53466796875, + "rewards/margins": 6.234375, + "rewards/rejected": -4.69140625, + "step": 1815 + }, + { + "epoch": 0.3603353340939531, + "grad_norm": 31.661886177700904, + "learning_rate": 8.269002823133768e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 4.0546875, + "logps/chosen": -814.0, + "logps/rejected": -535.5, + "loss": 0.5616, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.9482421875, + "rewards/margins": 5.16796875, + "rewards/rejected": -3.21875, + "step": 1816 + }, + { + "epoch": 0.36053375663475373, + "grad_norm": 33.94088176249816, + "learning_rate": 8.266545400946797e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 4.09765625, + "logps/chosen": -1038.0, + "logps/rejected": -995.0, + "loss": 0.3527, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0078125, + "rewards/margins": 8.4453125, + "rewards/rejected": -5.44140625, + "step": 1817 + }, + { + "epoch": 0.36073217917555433, + "grad_norm": 32.95153248857853, + "learning_rate": 8.26408665169714e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.8828125, + "logps/chosen": -778.5, + "logps/rejected": -774.0, + "loss": 0.4621, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.220703125, + "rewards/margins": 6.90625, + "rewards/rejected": -4.6796875, + "step": 1818 + }, + { + "epoch": 0.360930601716355, + "grad_norm": 31.356009539378746, + "learning_rate": 8.261626576564213e-07, + "logits/chosen": 4.35546875, + "logits/rejected": 4.34765625, + "logps/chosen": -1910.0, + "logps/rejected": -1355.0, + "loss": 0.4864, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.48046875, + "rewards/margins": 5.69921875, + "rewards/rejected": -5.22265625, + "step": 1819 + }, + { + "epoch": 0.3611290242571556, + "grad_norm": 
42.08366247473524, + "learning_rate": 8.259165176728075e-07, + "logits/chosen": 3.62890625, + "logits/rejected": 3.6171875, + "logps/chosen": -946.0, + "logps/rejected": -767.0, + "loss": 0.3729, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.791015625, + "rewards/margins": 7.0390625, + "rewards/rejected": -5.24609375, + "step": 1820 + }, + { + "epoch": 0.36132744679795625, + "grad_norm": 35.36034259818287, + "learning_rate": 8.256702453369412e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.78515625, + "logps/chosen": -729.5, + "logps/rejected": -1194.5, + "loss": 0.3973, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6552734375, + "rewards/margins": 8.390625, + "rewards/rejected": -6.734375, + "step": 1821 + }, + { + "epoch": 0.36152586933875686, + "grad_norm": 35.71890185878889, + "learning_rate": 8.254238407669552e-07, + "logits/chosen": 3.7109375, + "logits/rejected": 3.3125, + "logps/chosen": -946.0, + "logps/rejected": -661.0, + "loss": 0.3838, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.875, + "rewards/margins": 7.4921875, + "rewards/rejected": -5.625, + "step": 1822 + }, + { + "epoch": 0.3617242918795575, + "grad_norm": 32.3174942886865, + "learning_rate": 8.251773040810452e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 3.94140625, + "logps/chosen": -1046.0, + "logps/rejected": -700.0, + "loss": 0.4637, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4951171875, + "rewards/margins": 6.6328125, + "rewards/rejected": -5.1328125, + "step": 1823 + }, + { + "epoch": 0.36192271442035817, + "grad_norm": 35.38524393815555, + "learning_rate": 8.249306353974703e-07, + "logits/chosen": 4.265625, + "logits/rejected": 3.8515625, + "logps/chosen": -1110.0, + "logps/rejected": -722.0, + "loss": 0.4175, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.48388671875, + "rewards/margins": 7.99609375, + "rewards/rejected": -5.49609375, + "step": 1824 + }, + { + "epoch": 0.3621211369611588, + "grad_norm": 37.02083634558067, + 
"learning_rate": 8.246838348345534e-07, + "logits/chosen": 3.5234375, + "logits/rejected": 3.71875, + "logps/chosen": -742.5, + "logps/rejected": -532.5, + "loss": 0.5871, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.0615234375, + "rewards/margins": 4.1796875, + "rewards/rejected": -3.12109375, + "step": 1825 + }, + { + "epoch": 0.36231955950195943, + "grad_norm": 33.59124228309619, + "learning_rate": 8.244369025106803e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.20703125, + "logps/chosen": -907.0, + "logps/rejected": -644.0, + "loss": 0.4318, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.681640625, + "rewards/margins": 6.5234375, + "rewards/rejected": -4.84765625, + "step": 1826 + }, + { + "epoch": 0.36251798204276003, + "grad_norm": 32.472443514791955, + "learning_rate": 8.241898385442997e-07, + "logits/chosen": 4.4765625, + "logits/rejected": 4.31640625, + "logps/chosen": -1080.0, + "logps/rejected": -674.5, + "loss": 0.4982, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.81549072265625, + "rewards/margins": 6.29296875, + "rewards/rejected": -4.4765625, + "step": 1827 + }, + { + "epoch": 0.3627164045835607, + "grad_norm": 39.932418912450785, + "learning_rate": 8.239426430539243e-07, + "logits/chosen": 3.921875, + "logits/rejected": 3.890625, + "logps/chosen": -798.0, + "logps/rejected": -735.75, + "loss": 0.4882, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.3232421875, + "rewards/margins": 4.91796875, + "rewards/rejected": -3.58984375, + "step": 1828 + }, + { + "epoch": 0.36291482712436135, + "grad_norm": 33.784145244079575, + "learning_rate": 8.236953161581291e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 3.90234375, + "logps/chosen": -985.0, + "logps/rejected": -954.5, + "loss": 0.3983, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.568359375, + "rewards/margins": 6.796875, + "rewards/rejected": -5.2109375, + "step": 1829 + }, + { + "epoch": 0.36311324966516195, + "grad_norm": 43.81691716676718, + 
"learning_rate": 8.234478579755524e-07, + "logits/chosen": 3.875, + "logits/rejected": 4.046875, + "logps/chosen": -820.0, + "logps/rejected": -652.5, + "loss": 0.5509, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.9140625, + "rewards/margins": 4.8203125, + "rewards/rejected": -2.908203125, + "step": 1830 + }, + { + "epoch": 0.3633116722059626, + "grad_norm": 40.81960237461784, + "learning_rate": 8.232002686248955e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.62890625, + "logps/chosen": -982.0, + "logps/rejected": -1623.0, + "loss": 0.4195, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.078125, + "rewards/margins": 11.3125, + "rewards/rejected": -9.2421875, + "step": 1831 + }, + { + "epoch": 0.3635100947467632, + "grad_norm": 30.528595608714213, + "learning_rate": 8.229525482249227e-07, + "logits/chosen": 3.57421875, + "logits/rejected": 3.56640625, + "logps/chosen": -765.0, + "logps/rejected": -612.0, + "loss": 0.4501, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.095703125, + "rewards/margins": 5.4765625, + "rewards/rejected": -3.375, + "step": 1832 + }, + { + "epoch": 0.36370851728756387, + "grad_norm": 34.35491072554585, + "learning_rate": 8.227046968944614e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.37890625, + "logps/chosen": -884.5, + "logps/rejected": -806.0, + "loss": 0.5915, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.81591796875, + "rewards/margins": 6.00390625, + "rewards/rejected": -4.169921875, + "step": 1833 + }, + { + "epoch": 0.36390693982836453, + "grad_norm": 33.326204137421236, + "learning_rate": 8.224567147524005e-07, + "logits/chosen": 4.39453125, + "logits/rejected": 4.24609375, + "logps/chosen": -624.25, + "logps/rejected": -711.5, + "loss": 0.6121, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.101318359375, + "rewards/margins": 14.48828125, + "rewards/rejected": -13.37109375, + "step": 1834 + }, + { + "epoch": 0.36410536236916513, + "grad_norm": 41.443892717046545, + 
"learning_rate": 8.222086019176937e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.98046875, + "logps/chosen": -844.0, + "logps/rejected": -626.5, + "loss": 0.4885, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.45849609375, + "rewards/margins": 6.625, + "rewards/rejected": -5.15625, + "step": 1835 + }, + { + "epoch": 0.3643037849099658, + "grad_norm": 28.4956291364985, + "learning_rate": 8.219603585093558e-07, + "logits/chosen": 3.8125, + "logits/rejected": 4.0078125, + "logps/chosen": -911.5, + "logps/rejected": -2249.0, + "loss": 0.4107, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.890625, + "rewards/margins": 10.2421875, + "rewards/rejected": -8.375, + "step": 1836 + }, + { + "epoch": 0.3645022074507664, + "grad_norm": 34.110033465871545, + "learning_rate": 8.217119846464647e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.109375, + "logps/chosen": -934.0, + "logps/rejected": -888.25, + "loss": 0.5251, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.783203125, + "rewards/margins": 4.625, + "rewards/rejected": -2.8388671875, + "step": 1837 + }, + { + "epoch": 0.36470062999156705, + "grad_norm": 33.671034606308964, + "learning_rate": 8.214634804481613e-07, + "logits/chosen": 3.640625, + "logits/rejected": 3.91796875, + "logps/chosen": -885.0, + "logps/rejected": -820.0, + "loss": 0.4003, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.73046875, + "rewards/margins": 8.125, + "rewards/rejected": -5.37890625, + "step": 1838 + }, + { + "epoch": 0.36489905253236765, + "grad_norm": 28.69450947776722, + "learning_rate": 8.212148460336483e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.26953125, + "logps/chosen": -868.0, + "logps/rejected": -714.0, + "loss": 0.4968, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.12109375, + "rewards/margins": 7.3828125, + "rewards/rejected": -5.2421875, + "step": 1839 + }, + { + "epoch": 0.3650974750731683, + "grad_norm": 34.05154093843934, + "learning_rate": 
8.209660815221912e-07, + "logits/chosen": 3.796875, + "logits/rejected": 3.8984375, + "logps/chosen": -1076.0, + "logps/rejected": -826.0, + "loss": 0.4264, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4501953125, + "rewards/margins": 6.48828125, + "rewards/rejected": -4.03515625, + "step": 1840 + }, + { + "epoch": 0.36529589761396897, + "grad_norm": 27.274722214885163, + "learning_rate": 8.207171870331181e-07, + "logits/chosen": 3.55859375, + "logits/rejected": 3.4609375, + "logps/chosen": -986.0, + "logps/rejected": -633.0, + "loss": 0.4189, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.3671875, + "rewards/margins": 6.76171875, + "rewards/rejected": -3.38671875, + "step": 1841 + }, + { + "epoch": 0.36549432015476957, + "grad_norm": 33.97562140817503, + "learning_rate": 8.204681626858193e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.8046875, + "logps/chosen": -1251.0, + "logps/rejected": -875.0, + "loss": 0.2969, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8515625, + "rewards/margins": 8.515625, + "rewards/rejected": -5.671875, + "step": 1842 + }, + { + "epoch": 0.3656927426955702, + "grad_norm": 38.36504182192184, + "learning_rate": 8.202190085997471e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.8359375, + "logps/chosen": -1310.0, + "logps/rejected": -874.0, + "loss": 0.3008, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.51171875, + "rewards/margins": 8.15625, + "rewards/rejected": -4.6328125, + "step": 1843 + }, + { + "epoch": 0.36589116523637083, + "grad_norm": 37.08029429283967, + "learning_rate": 8.199697248944167e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 3.828125, + "logps/chosen": -740.0, + "logps/rejected": -726.0, + "loss": 0.5508, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.6923828125, + "rewards/margins": 5.8515625, + "rewards/rejected": -4.150390625, + "step": 1844 + }, + { + "epoch": 0.3660895877771715, + "grad_norm": 36.02227695087581, + "learning_rate": 
8.197203116894045e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.8046875, + "logps/chosen": -956.0, + "logps/rejected": -921.0, + "loss": 0.4225, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.63671875, + "rewards/margins": 7.0546875, + "rewards/rejected": -5.4140625, + "step": 1845 + }, + { + "epoch": 0.36628801031797215, + "grad_norm": 45.65193134148537, + "learning_rate": 8.194707691043501e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 4.40625, + "logps/chosen": -1058.0, + "logps/rejected": -875.5, + "loss": 0.3793, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.716796875, + "rewards/margins": 6.65625, + "rewards/rejected": -3.9375, + "step": 1846 + }, + { + "epoch": 0.36648643285877275, + "grad_norm": 35.27030894707657, + "learning_rate": 8.192210972589544e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.86328125, + "logps/chosen": -1264.0, + "logps/rejected": -805.0, + "loss": 0.3991, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7216796875, + "rewards/margins": 8.359375, + "rewards/rejected": -5.640625, + "step": 1847 + }, + { + "epoch": 0.3666848553995734, + "grad_norm": 32.631010900135564, + "learning_rate": 8.189712962729806e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.8828125, + "logps/chosen": -1246.0, + "logps/rejected": -711.0, + "loss": 0.4492, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.142578125, + "rewards/margins": 6.33984375, + "rewards/rejected": -4.19140625, + "step": 1848 + }, + { + "epoch": 0.366883277940374, + "grad_norm": 35.056740866632666, + "learning_rate": 8.187213662662538e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.01171875, + "logps/chosen": -1196.0, + "logps/rejected": -837.0, + "loss": 0.405, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1796875, + "rewards/margins": 8.25, + "rewards/rejected": -6.078125, + "step": 1849 + }, + { + "epoch": 0.36708170048117467, + "grad_norm": 35.73797392726354, + "learning_rate": 
8.184713073586608e-07, + "logits/chosen": 4.546875, + "logits/rejected": 4.44921875, + "logps/chosen": -1155.0, + "logps/rejected": -1085.0, + "loss": 0.3191, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.69140625, + "rewards/margins": 9.4375, + "rewards/rejected": -6.73828125, + "step": 1850 + }, + { + "epoch": 0.36728012302197527, + "grad_norm": 30.319598394204583, + "learning_rate": 8.182211196701509e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.47265625, + "logps/chosen": -689.5, + "logps/rejected": -401.75, + "loss": 0.5723, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.1611328125, + "rewards/margins": 4.21875, + "rewards/rejected": -3.05859375, + "step": 1851 + }, + { + "epoch": 0.3674785455627759, + "grad_norm": 36.99387133749374, + "learning_rate": 8.179708033207342e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.484375, + "logps/chosen": -849.0, + "logps/rejected": -660.5, + "loss": 0.526, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9765625, + "rewards/margins": 4.5, + "rewards/rejected": -3.52734375, + "step": 1852 + }, + { + "epoch": 0.3676769681035766, + "grad_norm": 28.272931248550513, + "learning_rate": 8.177203584304832e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.3359375, + "logps/chosen": -1130.0, + "logps/rejected": -689.5, + "loss": 0.3624, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.298828125, + "rewards/margins": 7.5859375, + "rewards/rejected": -5.28125, + "step": 1853 + }, + { + "epoch": 0.3678753906443772, + "grad_norm": 40.34148479042512, + "learning_rate": 8.174697851195318e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 4.09375, + "logps/chosen": -1046.0, + "logps/rejected": -708.0, + "loss": 0.2933, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.76171875, + "rewards/margins": 7.21875, + "rewards/rejected": -4.43359375, + "step": 1854 + }, + { + "epoch": 0.36807381318517784, + "grad_norm": 36.4694064168779, + "learning_rate": 8.172190835080756e-07, + 
"logits/chosen": 3.8125, + "logits/rejected": 4.03125, + "logps/chosen": -531.5, + "logps/rejected": -661.5, + "loss": 0.5344, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.359375, + "rewards/margins": 6.75, + "rewards/rejected": -5.37890625, + "step": 1855 + }, + { + "epoch": 0.36827223572597845, + "grad_norm": 29.13832160507485, + "learning_rate": 8.169682537163719e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 3.7734375, + "logps/chosen": -819.0, + "logps/rejected": -686.5, + "loss": 0.3552, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.8203125, + "rewards/margins": 7.0078125, + "rewards/rejected": -5.1875, + "step": 1856 + }, + { + "epoch": 0.3684706582667791, + "grad_norm": 42.5857038867225, + "learning_rate": 8.16717295864739e-07, + "logits/chosen": 4.0625, + "logits/rejected": 3.94921875, + "logps/chosen": -1247.5, + "logps/rejected": -1658.0, + "loss": 0.4359, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.9169921875, + "rewards/margins": 9.40625, + "rewards/rejected": -7.4921875, + "step": 1857 + }, + { + "epoch": 0.36866908080757976, + "grad_norm": 38.42768234752382, + "learning_rate": 8.164662100735572e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.8671875, + "logps/chosen": -942.0, + "logps/rejected": -542.0, + "loss": 0.523, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6558837890625, + "rewards/margins": 5.7421875, + "rewards/rejected": -4.0703125, + "step": 1858 + }, + { + "epoch": 0.36886750334838037, + "grad_norm": 34.856079090793095, + "learning_rate": 8.162149964632675e-07, + "logits/chosen": 4.34765625, + "logits/rejected": 4.30078125, + "logps/chosen": -914.0, + "logps/rejected": -634.0, + "loss": 0.3489, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5078125, + "rewards/margins": 7.875, + "rewards/rejected": -5.37109375, + "step": 1859 + }, + { + "epoch": 0.369065925889181, + "grad_norm": 30.14038686214864, + "learning_rate": 8.159636551543729e-07, + "logits/chosen": 4.12890625, + 
"logits/rejected": 3.890625, + "logps/chosen": -989.0, + "logps/rejected": -534.5, + "loss": 0.566, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.65234375, + "rewards/margins": 3.7421875, + "rewards/rejected": -2.09375, + "step": 1860 + }, + { + "epoch": 0.3692643484299816, + "grad_norm": 29.382908945349644, + "learning_rate": 8.157121862674373e-07, + "logits/chosen": 4.03125, + "logits/rejected": 3.80859375, + "logps/chosen": -1320.0, + "logps/rejected": -809.0, + "loss": 0.3556, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.837890625, + "rewards/margins": 7.8671875, + "rewards/rejected": -5.02734375, + "step": 1861 + }, + { + "epoch": 0.3694627709707823, + "grad_norm": 29.20725443097703, + "learning_rate": 8.15460589923086e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.17578125, + "logps/chosen": -828.0, + "logps/rejected": -830.0, + "loss": 0.4379, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.9677734375, + "rewards/margins": 6.265625, + "rewards/rejected": -4.298828125, + "step": 1862 + }, + { + "epoch": 0.36966119351158294, + "grad_norm": 37.4304639400816, + "learning_rate": 8.152088662420052e-07, + "logits/chosen": 4.5078125, + "logits/rejected": 4.37109375, + "logps/chosen": -927.0, + "logps/rejected": -729.0, + "loss": 0.5867, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.4375, + "rewards/margins": 5.853515625, + "rewards/rejected": -4.407867431640625, + "step": 1863 + }, + { + "epoch": 0.36985961605238354, + "grad_norm": 34.434084675874345, + "learning_rate": 8.149570153449421e-07, + "logits/chosen": 4.59375, + "logits/rejected": 4.55078125, + "logps/chosen": -1057.0, + "logps/rejected": -940.0, + "loss": 0.4209, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.365234375, + "rewards/margins": 8.2734375, + "rewards/rejected": -5.9140625, + "step": 1864 + }, + { + "epoch": 0.3700580385931842, + "grad_norm": 29.219564242487785, + "learning_rate": 8.147050373527052e-07, + "logits/chosen": 4.0625, + 
"logits/rejected": 3.9765625, + "logps/chosen": -933.0, + "logps/rejected": -777.0, + "loss": 0.3754, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4296875, + "rewards/margins": 7.640625, + "rewards/rejected": -5.19921875, + "step": 1865 + }, + { + "epoch": 0.3702564611339848, + "grad_norm": 30.556938428625486, + "learning_rate": 8.144529323861641e-07, + "logits/chosen": 3.68359375, + "logits/rejected": 3.6875, + "logps/chosen": -1197.0, + "logps/rejected": -708.0, + "loss": 0.4273, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.29296875, + "rewards/margins": 6.859375, + "rewards/rejected": -4.5625, + "step": 1866 + }, + { + "epoch": 0.37045488367478546, + "grad_norm": 22.531740960921447, + "learning_rate": 8.142007005662488e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.19921875, + "logps/chosen": -852.0, + "logps/rejected": -583.0, + "loss": 0.5581, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.67578125, + "rewards/margins": 5.01171875, + "rewards/rejected": -3.33642578125, + "step": 1867 + }, + { + "epoch": 0.37065330621558606, + "grad_norm": 30.959302241013578, + "learning_rate": 8.139483420139504e-07, + "logits/chosen": 4.36328125, + "logits/rejected": 3.65234375, + "logps/chosen": -814.0, + "logps/rejected": -1082.5, + "loss": 0.478, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2705078125, + "rewards/margins": 5.5390625, + "rewards/rejected": -4.267578125, + "step": 1868 + }, + { + "epoch": 0.3708517287563867, + "grad_norm": 37.46371611947812, + "learning_rate": 8.13695856850321e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.67578125, + "logps/chosen": -923.0, + "logps/rejected": -579.5, + "loss": 0.459, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.7451171875, + "rewards/margins": 5.78125, + "rewards/rejected": -4.03515625, + "step": 1869 + }, + { + "epoch": 0.3710501512971874, + "grad_norm": 33.089474408906675, + "learning_rate": 8.134432451964732e-07, + "logits/chosen": 3.92578125, + 
"logits/rejected": 4.0, + "logps/chosen": -1394.0, + "logps/rejected": -910.0, + "loss": 0.2963, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.923828125, + "rewards/margins": 8.23046875, + "rewards/rejected": -5.3046875, + "step": 1870 + }, + { + "epoch": 0.371248573837988, + "grad_norm": 31.381164964277417, + "learning_rate": 8.131905071735803e-07, + "logits/chosen": 3.7109375, + "logits/rejected": 4.01171875, + "logps/chosen": -788.0, + "logps/rejected": -738.0, + "loss": 0.448, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.5029296875, + "rewards/margins": 6.3203125, + "rewards/rejected": -4.828125, + "step": 1871 + }, + { + "epoch": 0.37144699637878864, + "grad_norm": 35.085816793094786, + "learning_rate": 8.129376429028763e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 4.0390625, + "logps/chosen": -644.5, + "logps/rejected": -667.25, + "loss": 0.6587, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.0672607421875, + "rewards/margins": 4.53125, + "rewards/rejected": -3.46875, + "step": 1872 + }, + { + "epoch": 0.37164541891958924, + "grad_norm": 30.54377400429775, + "learning_rate": 8.126846525056555e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 4.0234375, + "logps/chosen": -974.0, + "logps/rejected": -603.0, + "loss": 0.3831, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.03466796875, + "rewards/margins": 6.46484375, + "rewards/rejected": -4.4296875, + "step": 1873 + }, + { + "epoch": 0.3718438414603899, + "grad_norm": 32.26382538440805, + "learning_rate": 8.12431536103273e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.04296875, + "logps/chosen": -797.0, + "logps/rejected": -705.0, + "loss": 0.424, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.056640625, + "rewards/margins": 7.25, + "rewards/rejected": -5.203125, + "step": 1874 + }, + { + "epoch": 0.37204226400119056, + "grad_norm": 29.801667498811472, + "learning_rate": 8.121782938171443e-07, + "logits/chosen": 3.90625, + "logits/rejected": 
3.66796875, + "logps/chosen": -722.5, + "logps/rejected": -473.5, + "loss": 0.3958, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.5966796875, + "rewards/margins": 5.6875, + "rewards/rejected": -4.08203125, + "step": 1875 + }, + { + "epoch": 0.37224068654199116, + "grad_norm": 29.94836571705892, + "learning_rate": 8.119249257687453e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.2890625, + "logps/chosen": -589.0, + "logps/rejected": -565.5, + "loss": 0.383, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.998046875, + "rewards/margins": 7.0078125, + "rewards/rejected": -5.0234375, + "step": 1876 + }, + { + "epoch": 0.3724391090827918, + "grad_norm": 39.03344061588027, + "learning_rate": 8.11671432079612e-07, + "logits/chosen": 3.984375, + "logits/rejected": 3.828125, + "logps/chosen": -1731.0, + "logps/rejected": -731.0, + "loss": 0.5363, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.52001953125, + "rewards/margins": 4.603515625, + "rewards/rejected": -4.087890625, + "step": 1877 + }, + { + "epoch": 0.3726375316235924, + "grad_norm": 38.02915433435262, + "learning_rate": 8.114178128713408e-07, + "logits/chosen": 3.4140625, + "logits/rejected": 3.2890625, + "logps/chosen": -1248.5, + "logps/rejected": -1028.5, + "loss": 0.4687, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.12890625, + "rewards/margins": 6.0546875, + "rewards/rejected": -3.923828125, + "step": 1878 + }, + { + "epoch": 0.3728359541643931, + "grad_norm": 30.552482430818298, + "learning_rate": 8.111640682655885e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.49609375, + "logps/chosen": -791.0, + "logps/rejected": -588.5, + "loss": 0.3846, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6845703125, + "rewards/margins": 5.6171875, + "rewards/rejected": -3.921875, + "step": 1879 + }, + { + "epoch": 0.37303437670519374, + "grad_norm": 33.44230267095474, + "learning_rate": 8.109101983840718e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 
3.94921875, + "logps/chosen": -1134.0, + "logps/rejected": -1071.0, + "loss": 0.3635, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.021484375, + "rewards/margins": 8.3125, + "rewards/rejected": -6.29296875, + "step": 1880 + }, + { + "epoch": 0.37323279924599434, + "grad_norm": 36.24493578005387, + "learning_rate": 8.106562033485676e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.79296875, + "logps/chosen": -1188.0, + "logps/rejected": -684.0, + "loss": 0.5075, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.31640625, + "rewards/margins": 6.2421875, + "rewards/rejected": -3.9296875, + "step": 1881 + }, + { + "epoch": 0.373431221786795, + "grad_norm": 39.7805929406413, + "learning_rate": 8.104020832809126e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 4.046875, + "logps/chosen": -1077.0, + "logps/rejected": -889.0, + "loss": 0.471, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.58837890625, + "rewards/margins": 5.90625, + "rewards/rejected": -4.33203125, + "step": 1882 + }, + { + "epoch": 0.3736296443275956, + "grad_norm": 28.075169322327667, + "learning_rate": 8.101478383030038e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.7734375, + "logps/chosen": -1155.0, + "logps/rejected": -596.5, + "loss": 0.3788, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.32421875, + "rewards/margins": 6.1640625, + "rewards/rejected": -5.8515625, + "step": 1883 + }, + { + "epoch": 0.37382806686839626, + "grad_norm": 26.668281033603307, + "learning_rate": 8.098934685367982e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.359375, + "logps/chosen": -1097.0, + "logps/rejected": -623.0, + "loss": 0.2685, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8671875, + "rewards/margins": 8.1484375, + "rewards/rejected": -5.29296875, + "step": 1884 + }, + { + "epoch": 0.37402648940919686, + "grad_norm": 42.26310586359825, + "learning_rate": 8.096389741043123e-07, + "logits/chosen": 3.625, + "logits/rejected": 3.9765625, + 
"logps/chosen": -1092.0, + "logps/rejected": -989.0, + "loss": 0.4767, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.05078125, + "rewards/margins": 7.6640625, + "rewards/rejected": -5.625, + "step": 1885 + }, + { + "epoch": 0.3742249119499975, + "grad_norm": 25.539391132569662, + "learning_rate": 8.093843551276225e-07, + "logits/chosen": 3.6640625, + "logits/rejected": 3.7890625, + "logps/chosen": -1360.0, + "logps/rejected": -1108.0, + "loss": 0.3405, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.75390625, + "rewards/margins": 14.2734375, + "rewards/rejected": -10.5234375, + "step": 1886 + }, + { + "epoch": 0.3744233344907982, + "grad_norm": 31.51426347849826, + "learning_rate": 8.09129611728865e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.2265625, + "logps/chosen": -1344.0, + "logps/rejected": -1141.0, + "loss": 0.3787, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.79296875, + "rewards/margins": 9.421875, + "rewards/rejected": -6.64453125, + "step": 1887 + }, + { + "epoch": 0.3746217570315988, + "grad_norm": 38.682124051860306, + "learning_rate": 8.08874744030236e-07, + "logits/chosen": 3.875, + "logits/rejected": 4.1328125, + "logps/chosen": -612.75, + "logps/rejected": -664.0, + "loss": 0.6068, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.42822265625, + "rewards/margins": 3.892578125, + "rewards/rejected": -2.45947265625, + "step": 1888 + }, + { + "epoch": 0.37482017957239944, + "grad_norm": 36.98255326561247, + "learning_rate": 8.086197521539904e-07, + "logits/chosen": 3.9375, + "logits/rejected": 3.56640625, + "logps/chosen": -903.0, + "logps/rejected": -532.0, + "loss": 0.4537, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.021484375, + "rewards/margins": 6.33203125, + "rewards/rejected": -4.310546875, + "step": 1889 + }, + { + "epoch": 0.37501860211320004, + "grad_norm": 29.122703309911603, + "learning_rate": 8.083646362224439e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.02734375, + 
"logps/chosen": -905.0, + "logps/rejected": -862.5, + "loss": 0.4323, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.96484375, + "rewards/margins": 8.5625, + "rewards/rejected": -6.609375, + "step": 1890 + }, + { + "epoch": 0.3752170246540007, + "grad_norm": 34.17557299565686, + "learning_rate": 8.081093963579707e-07, + "logits/chosen": 3.96875, + "logits/rejected": 4.265625, + "logps/chosen": -861.0, + "logps/rejected": -747.0, + "loss": 0.5123, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.5068359375, + "rewards/margins": 5.1474609375, + "rewards/rejected": -3.642578125, + "step": 1891 + }, + { + "epoch": 0.37541544719480135, + "grad_norm": 30.96323988985516, + "learning_rate": 8.078540326830048e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.40625, + "logps/chosen": -859.0, + "logps/rejected": -851.5, + "loss": 0.5105, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8515625, + "rewards/margins": 5.44140625, + "rewards/rejected": -3.58203125, + "step": 1892 + }, + { + "epoch": 0.37561386973560196, + "grad_norm": 37.017415137430234, + "learning_rate": 8.0759854532004e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.31640625, + "logps/chosen": -931.0, + "logps/rejected": -658.5, + "loss": 0.3404, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.2783203125, + "rewards/margins": 7.5, + "rewards/rejected": -5.21875, + "step": 1893 + }, + { + "epoch": 0.3758122922764026, + "grad_norm": 40.13233322582291, + "learning_rate": 8.073429343916287e-07, + "logits/chosen": 3.62890625, + "logits/rejected": 3.6875, + "logps/chosen": -1004.0, + "logps/rejected": -744.0, + "loss": 0.4611, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.216796875, + "rewards/margins": 6.453125, + "rewards/rejected": -4.2265625, + "step": 1894 + }, + { + "epoch": 0.3760107148172032, + "grad_norm": 30.850213115396457, + "learning_rate": 8.070872000203831e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.234375, + "logps/chosen": -916.0, + 
"logps/rejected": -630.0, + "loss": 0.4032, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.22265625, + "rewards/margins": 6.5390625, + "rewards/rejected": -4.3125, + "step": 1895 + }, + { + "epoch": 0.3762091373580039, + "grad_norm": 41.50961281768141, + "learning_rate": 8.068313423289742e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 3.94140625, + "logps/chosen": -895.0, + "logps/rejected": -927.0, + "loss": 0.5355, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.8701171875, + "rewards/margins": 15.18359375, + "rewards/rejected": -13.2890625, + "step": 1896 + }, + { + "epoch": 0.3764075598988045, + "grad_norm": 30.719351526710362, + "learning_rate": 8.065753614401328e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.3046875, + "logps/chosen": -786.0, + "logps/rejected": -727.0, + "loss": 0.5439, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.775390625, + "rewards/margins": 4.796875, + "rewards/rejected": -3.015625, + "step": 1897 + }, + { + "epoch": 0.37660598243960514, + "grad_norm": 23.847343729290582, + "learning_rate": 8.063192574766479e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.203125, + "logps/chosen": -1035.0, + "logps/rejected": -817.0, + "loss": 0.4217, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4052734375, + "rewards/margins": 7.0546875, + "rewards/rejected": -5.6484375, + "step": 1898 + }, + { + "epoch": 0.3768044049804058, + "grad_norm": 45.9312593967284, + "learning_rate": 8.060630305613683e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 3.91796875, + "logps/chosen": -1305.0, + "logps/rejected": -664.5, + "loss": 0.4563, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.642578125, + "rewards/margins": 6.625, + "rewards/rejected": -4.984375, + "step": 1899 + }, + { + "epoch": 0.3770028275212064, + "grad_norm": 34.341924950245975, + "learning_rate": 8.058066808172016e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.3046875, + "logps/chosen": -964.0, + "logps/rejected": 
-779.0, + "loss": 0.4732, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.57421875, + "rewards/margins": 6.9296875, + "rewards/rejected": -5.3515625, + "step": 1900 + }, + { + "epoch": 0.37720125006200705, + "grad_norm": 30.630101712186132, + "learning_rate": 8.055502083671139e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 3.8515625, + "logps/chosen": -1026.0, + "logps/rejected": -521.5, + "loss": 0.571, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.708984375, + "rewards/margins": 5.03125, + "rewards/rejected": -3.322265625, + "step": 1901 + }, + { + "epoch": 0.37739967260280766, + "grad_norm": 28.01319834282694, + "learning_rate": 8.052936133341307e-07, + "logits/chosen": 4.125, + "logits/rejected": 3.984375, + "logps/chosen": -1292.0, + "logps/rejected": -764.0, + "loss": 0.4014, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.609375, + "rewards/margins": 8.4296875, + "rewards/rejected": -5.84765625, + "step": 1902 + }, + { + "epoch": 0.3775980951436083, + "grad_norm": 36.4377493036158, + "learning_rate": 8.05036895841336e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.05859375, + "logps/chosen": -896.0, + "logps/rejected": -720.5, + "loss": 0.5084, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.4912109375, + "rewards/margins": 6.890625, + "rewards/rejected": -4.3984375, + "step": 1903 + }, + { + "epoch": 0.377796517684409, + "grad_norm": 34.50554685475972, + "learning_rate": 8.047800560118727e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 4.1953125, + "logps/chosen": -819.0, + "logps/rejected": -1478.0, + "loss": 0.5593, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.955078125, + "rewards/margins": 7.8671875, + "rewards/rejected": -6.90625, + "step": 1904 + }, + { + "epoch": 0.3779949402252096, + "grad_norm": 51.31662490788163, + "learning_rate": 8.045230939689424e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.8046875, + "logps/chosen": -746.0, + "logps/rejected": -726.0, + "loss": 
0.5453, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.42138671875, + "rewards/margins": 5.2265625, + "rewards/rejected": -3.80859375, + "step": 1905 + }, + { + "epoch": 0.37819336276601023, + "grad_norm": 32.479793250029104, + "learning_rate": 8.042660098358049e-07, + "logits/chosen": 4.296875, + "logits/rejected": 3.828125, + "logps/chosen": -1186.0, + "logps/rejected": -701.75, + "loss": 0.3489, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.1220703125, + "rewards/margins": 7.8046875, + "rewards/rejected": -5.671875, + "step": 1906 + }, + { + "epoch": 0.37839178530681084, + "grad_norm": 34.73286709022736, + "learning_rate": 8.040088037357791e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.40234375, + "logps/chosen": -750.5, + "logps/rejected": -962.0, + "loss": 0.5136, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.72265625, + "rewards/margins": 7.1875, + "rewards/rejected": -5.4609375, + "step": 1907 + }, + { + "epoch": 0.3785902078476115, + "grad_norm": 31.832060077307098, + "learning_rate": 8.037514757922422e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 3.6171875, + "logps/chosen": -860.5, + "logps/rejected": -558.5, + "loss": 0.4689, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.7177734375, + "rewards/margins": 6.46875, + "rewards/rejected": -4.7421875, + "step": 1908 + }, + { + "epoch": 0.37878863038841215, + "grad_norm": 31.726640850019283, + "learning_rate": 8.034940261286298e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.3203125, + "logps/chosen": -1067.0, + "logps/rejected": -689.0, + "loss": 0.4636, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.884765625, + "rewards/margins": 6.6953125, + "rewards/rejected": -3.82421875, + "step": 1909 + }, + { + "epoch": 0.37898705292921275, + "grad_norm": 31.74074112822469, + "learning_rate": 8.032364548684361e-07, + "logits/chosen": 4.140625, + "logits/rejected": 3.6953125, + "logps/chosen": -1258.0, + "logps/rejected": -795.5, + "loss": 0.4151, 
+ "rewards/accuracies": 0.84375, + "rewards/chosen": 2.060546875, + "rewards/margins": 7.3125, + "rewards/rejected": -5.24609375, + "step": 1910 + }, + { + "epoch": 0.3791854754700134, + "grad_norm": 26.031531080415927, + "learning_rate": 8.02978762135213e-07, + "logits/chosen": 4.44921875, + "logits/rejected": 4.8046875, + "logps/chosen": -1013.0, + "logps/rejected": -943.0, + "loss": 0.3607, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.220703125, + "rewards/margins": 8.40625, + "rewards/rejected": -6.1875, + "step": 1911 + }, + { + "epoch": 0.379383898010814, + "grad_norm": 25.940749766671058, + "learning_rate": 8.027209480525718e-07, + "logits/chosen": 3.875, + "logits/rejected": 4.0390625, + "logps/chosen": -950.0, + "logps/rejected": -682.5, + "loss": 0.3076, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5625, + "rewards/margins": 7.828125, + "rewards/rejected": -5.26953125, + "step": 1912 + }, + { + "epoch": 0.37958232055161467, + "grad_norm": 37.17688399005549, + "learning_rate": 8.024630127441808e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.06640625, + "logps/chosen": -976.0, + "logps/rejected": -803.0, + "loss": 0.4131, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.37890625, + "rewards/margins": 14.8828125, + "rewards/rejected": -12.515625, + "step": 1913 + }, + { + "epoch": 0.3797807430924153, + "grad_norm": 37.3089325309892, + "learning_rate": 8.022049563337671e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.6640625, + "logps/chosen": -800.0, + "logps/rejected": -562.5, + "loss": 0.3939, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1796875, + "rewards/margins": 6.1875, + "rewards/rejected": -4.01171875, + "step": 1914 + }, + { + "epoch": 0.37997916563321593, + "grad_norm": 37.20061501076204, + "learning_rate": 8.019467789451158e-07, + "logits/chosen": 3.9375, + "logits/rejected": 4.13671875, + "logps/chosen": -1162.0, + "logps/rejected": -818.0, + "loss": 0.4087, + "rewards/accuracies": 0.84375, 
+ "rewards/chosen": 2.01953125, + "rewards/margins": 7.7421875, + "rewards/rejected": -5.7265625, + "step": 1915 + }, + { + "epoch": 0.3801775881740166, + "grad_norm": 32.3959819915702, + "learning_rate": 8.016884807020703e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 4.36328125, + "logps/chosen": -1041.0, + "logps/rejected": -968.0, + "loss": 0.3913, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.314453125, + "rewards/margins": 8.0, + "rewards/rejected": -5.68359375, + "step": 1916 + }, + { + "epoch": 0.3803760107148172, + "grad_norm": 48.719719122512814, + "learning_rate": 8.014300617285311e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.96484375, + "logps/chosen": -1196.0, + "logps/rejected": -1006.0, + "loss": 0.515, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.9228515625, + "rewards/margins": 6.7734375, + "rewards/rejected": -4.849609375, + "step": 1917 + }, + { + "epoch": 0.38057443325561785, + "grad_norm": 31.454796793056513, + "learning_rate": 8.011715221484579e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.4375, + "logps/chosen": -1312.0, + "logps/rejected": -962.0, + "loss": 0.3679, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.609375, + "rewards/margins": 9.0234375, + "rewards/rejected": -6.4140625, + "step": 1918 + }, + { + "epoch": 0.38077285579641845, + "grad_norm": 34.02871647995948, + "learning_rate": 8.009128620858667e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.26171875, + "logps/chosen": -667.0, + "logps/rejected": -562.5, + "loss": 0.4963, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.78515625, + "rewards/margins": 4.9609375, + "rewards/rejected": -3.17578125, + "step": 1919 + }, + { + "epoch": 0.3809712783372191, + "grad_norm": 42.41621239080979, + "learning_rate": 8.006540816648329e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.18359375, + "logps/chosen": -1083.0, + "logps/rejected": -1933.5, + "loss": 0.4525, + "rewards/accuracies": 0.875, + 
"rewards/chosen": 1.0458984375, + "rewards/margins": 8.46875, + "rewards/rejected": -7.4140625, + "step": 1920 + }, + { + "epoch": 0.38116970087801977, + "grad_norm": 33.25790324641263, + "learning_rate": 8.003951810094884e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 4.01171875, + "logps/chosen": -1163.0, + "logps/rejected": -671.0, + "loss": 0.3245, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.240234375, + "rewards/margins": 7.6484375, + "rewards/rejected": -5.4140625, + "step": 1921 + }, + { + "epoch": 0.38136812341882037, + "grad_norm": 38.640821598530124, + "learning_rate": 8.001361602440235e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.34765625, + "logps/chosen": -1226.0, + "logps/rejected": -860.0, + "loss": 0.4018, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.21484375, + "rewards/margins": 8.3515625, + "rewards/rejected": -5.12109375, + "step": 1922 + }, + { + "epoch": 0.38156654595962103, + "grad_norm": 28.916067860310775, + "learning_rate": 7.998770194926857e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 4.21875, + "logps/chosen": -784.0, + "logps/rejected": -1247.0, + "loss": 0.4832, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.0986328125, + "rewards/margins": 7.06640625, + "rewards/rejected": -5.98046875, + "step": 1923 + }, + { + "epoch": 0.38176496850042163, + "grad_norm": 33.32151922539212, + "learning_rate": 7.996177588797804e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 4.28515625, + "logps/chosen": -972.0, + "logps/rejected": -817.0, + "loss": 0.3084, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.146484375, + "rewards/margins": 8.109375, + "rewards/rejected": -5.9609375, + "step": 1924 + }, + { + "epoch": 0.3819633910412223, + "grad_norm": 31.161855324445256, + "learning_rate": 7.993583785296701e-07, + "logits/chosen": 4.78125, + "logits/rejected": 4.8984375, + "logps/chosen": -972.0, + "logps/rejected": -874.0, + "loss": 0.3946, + "rewards/accuracies": 0.90625, + 
"rewards/chosen": 1.99609375, + "rewards/margins": 8.3984375, + "rewards/rejected": -6.41015625, + "step": 1925 + }, + { + "epoch": 0.3821618135820229, + "grad_norm": 30.93736834983363, + "learning_rate": 7.99098878566775e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.30859375, + "logps/chosen": -1149.0, + "logps/rejected": -752.5, + "loss": 0.2872, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.97265625, + "rewards/margins": 9.0625, + "rewards/rejected": -6.0859375, + "step": 1926 + }, + { + "epoch": 0.38236023612282355, + "grad_norm": 41.451681233787994, + "learning_rate": 7.988392591155727e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.30859375, + "logps/chosen": -860.5, + "logps/rejected": -662.0, + "loss": 0.5214, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.5625, + "rewards/margins": 5.0, + "rewards/rejected": -2.4453125, + "step": 1927 + }, + { + "epoch": 0.3825586586636242, + "grad_norm": 31.462362617439805, + "learning_rate": 7.98579520300598e-07, + "logits/chosen": 4.28515625, + "logits/rejected": 3.97265625, + "logps/chosen": -925.5, + "logps/rejected": -573.0, + "loss": 0.4228, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.357421875, + "rewards/margins": 6.556640625, + "rewards/rejected": -4.20556640625, + "step": 1928 + }, + { + "epoch": 0.3827570812044248, + "grad_norm": 42.88282144649774, + "learning_rate": 7.983196622464431e-07, + "logits/chosen": 4.625, + "logits/rejected": 4.5078125, + "logps/chosen": -1226.0, + "logps/rejected": -858.5, + "loss": 0.4241, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.015625, + "rewards/margins": 6.671875, + "rewards/rejected": -3.65234375, + "step": 1929 + }, + { + "epoch": 0.38295550374522547, + "grad_norm": 36.78652095315674, + "learning_rate": 7.980596850777571e-07, + "logits/chosen": 4.09375, + "logits/rejected": 3.94140625, + "logps/chosen": -960.0, + "logps/rejected": -619.5, + "loss": 0.4766, + "rewards/accuracies": 0.875, + "rewards/chosen": 
1.2763671875, + "rewards/margins": 5.29296875, + "rewards/rejected": -4.02734375, + "step": 1930 + }, + { + "epoch": 0.38315392628602607, + "grad_norm": 30.872538863077597, + "learning_rate": 7.977995889192465e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.12890625, + "logps/chosen": -1094.0, + "logps/rejected": -826.0, + "loss": 0.3429, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.63671875, + "rewards/margins": 7.3203125, + "rewards/rejected": -4.69140625, + "step": 1931 + }, + { + "epoch": 0.3833523488268267, + "grad_norm": 40.91132502814732, + "learning_rate": 7.975393738956749e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.0390625, + "logps/chosen": -883.0, + "logps/rejected": -648.5, + "loss": 0.5236, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.302734375, + "rewards/margins": 5.87109375, + "rewards/rejected": -3.55859375, + "step": 1932 + }, + { + "epoch": 0.3835507713676274, + "grad_norm": 40.10599104727082, + "learning_rate": 7.972790401318627e-07, + "logits/chosen": 4.4765625, + "logits/rejected": 4.5625, + "logps/chosen": -987.0, + "logps/rejected": -745.0, + "loss": 0.4331, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.958984375, + "rewards/margins": 6.8515625, + "rewards/rejected": -4.8984375, + "step": 1933 + }, + { + "epoch": 0.383749193908428, + "grad_norm": 31.857065410903484, + "learning_rate": 7.970185877526875e-07, + "logits/chosen": 3.63671875, + "logits/rejected": 3.65625, + "logps/chosen": -810.0, + "logps/rejected": -575.5, + "loss": 0.4424, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6474609375, + "rewards/margins": 5.94140625, + "rewards/rejected": -4.2890625, + "step": 1934 + }, + { + "epoch": 0.38394761644922865, + "grad_norm": 33.299233754430986, + "learning_rate": 7.967580168830835e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.98046875, + "logps/chosen": -1048.0, + "logps/rejected": -814.0, + "loss": 0.3641, + "rewards/accuracies": 0.90625, + "rewards/chosen": 
1.51171875, + "rewards/margins": 8.0625, + "rewards/rejected": -6.55859375, + "step": 1935 + }, + { + "epoch": 0.38414603899002925, + "grad_norm": 38.43479296625643, + "learning_rate": 7.964973276480421e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.09375, + "logps/chosen": -1180.0, + "logps/rejected": -1310.5, + "loss": 0.3999, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.361328125, + "rewards/margins": 8.6953125, + "rewards/rejected": -6.34375, + "step": 1936 + }, + { + "epoch": 0.3843444615308299, + "grad_norm": 37.459018693292, + "learning_rate": 7.962365201726112e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.1484375, + "logps/chosen": -1339.0, + "logps/rejected": -858.0, + "loss": 0.3287, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.91796875, + "rewards/margins": 8.453125, + "rewards/rejected": -5.5234375, + "step": 1937 + }, + { + "epoch": 0.38454288407163056, + "grad_norm": 32.80085005837471, + "learning_rate": 7.959755945818956e-07, + "logits/chosen": 3.59765625, + "logits/rejected": 3.65234375, + "logps/chosen": -836.5, + "logps/rejected": -597.5, + "loss": 0.3714, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.7890625, + "rewards/margins": 7.0859375, + "rewards/rejected": -5.296875, + "step": 1938 + }, + { + "epoch": 0.38474130661243117, + "grad_norm": 46.06935956807903, + "learning_rate": 7.957145510010566e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.7421875, + "logps/chosen": -652.0, + "logps/rejected": -712.5, + "loss": 0.6696, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7998046875, + "rewards/margins": 4.42578125, + "rewards/rejected": -3.6328125, + "step": 1939 + }, + { + "epoch": 0.3849397291532318, + "grad_norm": 31.754554282807618, + "learning_rate": 7.95453389555312e-07, + "logits/chosen": 3.84375, + "logits/rejected": 4.12109375, + "logps/chosen": -1059.0, + "logps/rejected": -1071.0, + "loss": 0.326, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.63671875, + 
"rewards/margins": 8.09375, + "rewards/rejected": -5.46484375, + "step": 1940 + }, + { + "epoch": 0.3851381516940324, + "grad_norm": 34.11092171906542, + "learning_rate": 7.951921103699368e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 3.83203125, + "logps/chosen": -1153.0, + "logps/rejected": -669.0, + "loss": 0.4301, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9765625, + "rewards/margins": 6.6328125, + "rewards/rejected": -4.6484375, + "step": 1941 + }, + { + "epoch": 0.3853365742348331, + "grad_norm": 43.715924892473524, + "learning_rate": 7.949307135702615e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.01953125, + "logps/chosen": -1075.0, + "logps/rejected": -723.0, + "loss": 0.5008, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.84375, + "rewards/margins": 5.484375, + "rewards/rejected": -3.63671875, + "step": 1942 + }, + { + "epoch": 0.3855349967756337, + "grad_norm": 46.826610883633975, + "learning_rate": 7.946691992816737e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.125, + "logps/chosen": -921.0, + "logps/rejected": -796.0, + "loss": 0.4608, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.37841796875, + "rewards/margins": 6.90625, + "rewards/rejected": -5.51953125, + "step": 1943 + }, + { + "epoch": 0.38573341931643434, + "grad_norm": 35.203638980202605, + "learning_rate": 7.94407567629617e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 4.01953125, + "logps/chosen": -1057.0, + "logps/rejected": -766.0, + "loss": 0.4333, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8211669921875, + "rewards/margins": 6.3984375, + "rewards/rejected": -4.56640625, + "step": 1944 + }, + { + "epoch": 0.385931841857235, + "grad_norm": 32.34011260093893, + "learning_rate": 7.941458187395917e-07, + "logits/chosen": 3.828125, + "logits/rejected": 3.64453125, + "logps/chosen": -738.0, + "logps/rejected": -713.5, + "loss": 0.5271, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.6083984375, + 
"rewards/margins": 5.8984375, + "rewards/rejected": -4.2890625, + "step": 1945 + }, + { + "epoch": 0.3861302643980356, + "grad_norm": 38.105730523157966, + "learning_rate": 7.93883952737154e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 4.1171875, + "logps/chosen": -939.0, + "logps/rejected": -678.0, + "loss": 0.4834, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.43408203125, + "rewards/margins": 5.6640625, + "rewards/rejected": -4.22265625, + "step": 1946 + }, + { + "epoch": 0.38632868693883626, + "grad_norm": 38.58910420065849, + "learning_rate": 7.936219697479163e-07, + "logits/chosen": 4.33984375, + "logits/rejected": 4.3671875, + "logps/chosen": -932.0, + "logps/rejected": -796.0, + "loss": 0.4846, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.9921875, + "rewards/margins": 5.91015625, + "rewards/rejected": -3.91015625, + "step": 1947 + }, + { + "epoch": 0.38652710947963687, + "grad_norm": 32.16208646514428, + "learning_rate": 7.933598698975472e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.06640625, + "logps/chosen": -837.0, + "logps/rejected": -545.5, + "loss": 0.3524, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.0947265625, + "rewards/margins": 6.3671875, + "rewards/rejected": -4.27734375, + "step": 1948 + }, + { + "epoch": 0.3867255320204375, + "grad_norm": 31.68229094313502, + "learning_rate": 7.930976533117715e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.20703125, + "logps/chosen": -1011.0, + "logps/rejected": -677.5, + "loss": 0.4748, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.232421875, + "rewards/margins": 5.92578125, + "rewards/rejected": -3.68359375, + "step": 1949 + }, + { + "epoch": 0.3869239545612382, + "grad_norm": 39.54182352154494, + "learning_rate": 7.928353201163697e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.08984375, + "logps/chosen": -1119.0, + "logps/rejected": -659.0, + "loss": 0.377, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.984375, + 
"rewards/margins": 8.875, + "rewards/rejected": -5.88671875, + "step": 1950 + }, + { + "epoch": 0.3871223771020388, + "grad_norm": 36.79345276019466, + "learning_rate": 7.925728704371784e-07, + "logits/chosen": 4.53125, + "logits/rejected": 4.4453125, + "logps/chosen": -1297.0, + "logps/rejected": -1077.5, + "loss": 0.388, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.8740234375, + "rewards/margins": 9.25, + "rewards/rejected": -7.375, + "step": 1951 + }, + { + "epoch": 0.38732079964283944, + "grad_norm": 36.442056978493625, + "learning_rate": 7.923103044000899e-07, + "logits/chosen": 3.78125, + "logits/rejected": 4.04296875, + "logps/chosen": -744.0, + "logps/rejected": -510.5, + "loss": 0.502, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.599609375, + "rewards/margins": 5.390625, + "rewards/rejected": -3.7890625, + "step": 1952 + }, + { + "epoch": 0.38751922218364004, + "grad_norm": 31.3759062265709, + "learning_rate": 7.920476221310526e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.00390625, + "logps/chosen": -1300.0, + "logps/rejected": -1345.0, + "loss": 0.2619, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.05078125, + "rewards/margins": 10.0546875, + "rewards/rejected": -6.9921875, + "step": 1953 + }, + { + "epoch": 0.3877176447244407, + "grad_norm": 28.839445142898423, + "learning_rate": 7.917848237560708e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.08203125, + "logps/chosen": -1335.0, + "logps/rejected": -967.0, + "loss": 0.4085, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.037109375, + "rewards/margins": 6.7421875, + "rewards/rejected": -5.703125, + "step": 1954 + }, + { + "epoch": 0.3879160672652413, + "grad_norm": 33.342580297651814, + "learning_rate": 7.915219094012036e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.81640625, + "logps/chosen": -973.5, + "logps/rejected": -848.5, + "loss": 0.3673, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.375, + "rewards/margins": 7.9921875, 
+ "rewards/rejected": -5.61328125, + "step": 1955 + }, + { + "epoch": 0.38811448980604196, + "grad_norm": 35.208294357442455, + "learning_rate": 7.91258879192567e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 3.8984375, + "logps/chosen": -1049.0, + "logps/rejected": -1699.0, + "loss": 0.4727, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.6533203125, + "rewards/margins": 9.0546875, + "rewards/rejected": -7.3984375, + "step": 1956 + }, + { + "epoch": 0.3883129123468426, + "grad_norm": 34.3352815889517, + "learning_rate": 7.909957332563312e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 4.4375, + "logps/chosen": -692.5, + "logps/rejected": -589.5, + "loss": 0.532, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.4970703125, + "rewards/margins": 5.80859375, + "rewards/rejected": -4.3046875, + "step": 1957 + }, + { + "epoch": 0.3885113348876432, + "grad_norm": 32.5265496613888, + "learning_rate": 7.907324717187234e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.1875, + "logps/chosen": -888.0, + "logps/rejected": -610.5, + "loss": 0.3364, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.287109375, + "rewards/margins": 7.03125, + "rewards/rejected": -4.734375, + "step": 1958 + }, + { + "epoch": 0.3887097574284439, + "grad_norm": 33.686186946540936, + "learning_rate": 7.904690947060248e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 3.52734375, + "logps/chosen": -1191.0, + "logps/rejected": -942.0, + "loss": 0.4737, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.7884521484375, + "rewards/margins": 7.0390625, + "rewards/rejected": -5.25390625, + "step": 1959 + }, + { + "epoch": 0.3889081799692445, + "grad_norm": 31.42188252849987, + "learning_rate": 7.902056023445733e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.18359375, + "logps/chosen": -1176.0, + "logps/rejected": -909.0, + "loss": 0.4002, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.90234375, + "rewards/margins": 8.390625, + 
"rewards/rejected": -6.5, + "step": 1960 + }, + { + "epoch": 0.38910660251004514, + "grad_norm": 37.54334611035941, + "learning_rate": 7.899419947607611e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.3046875, + "logps/chosen": -982.0, + "logps/rejected": -1393.0, + "loss": 0.4185, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1982421875, + "rewards/margins": 9.96875, + "rewards/rejected": -7.75390625, + "step": 1961 + }, + { + "epoch": 0.3893050250508458, + "grad_norm": 35.740878764323895, + "learning_rate": 7.896782720810361e-07, + "logits/chosen": 4.39453125, + "logits/rejected": 4.7421875, + "logps/chosen": -855.0, + "logps/rejected": -1293.0, + "loss": 0.5018, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4248046875, + "rewards/margins": 8.984375, + "rewards/rejected": -6.5390625, + "step": 1962 + }, + { + "epoch": 0.3895034475916464, + "grad_norm": 37.08254446991293, + "learning_rate": 7.894144344319013e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.22265625, + "logps/chosen": -1057.0, + "logps/rejected": -875.0, + "loss": 0.4437, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.3828125, + "rewards/margins": 10.3828125, + "rewards/rejected": -9.015625, + "step": 1963 + }, + { + "epoch": 0.38970187013244706, + "grad_norm": 43.9340334539318, + "learning_rate": 7.891504819399152e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.18359375, + "logps/chosen": -1015.0, + "logps/rejected": -799.0, + "loss": 0.4835, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.369140625, + "rewards/margins": 6.046875, + "rewards/rejected": -4.671875, + "step": 1964 + }, + { + "epoch": 0.38990029267324766, + "grad_norm": 36.34720809692305, + "learning_rate": 7.888864147316911e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.453125, + "logps/chosen": -524.5, + "logps/rejected": -646.0, + "loss": 0.7222, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.06005859375, + "rewards/margins": 4.00390625, + "rewards/rejected": 
-2.9482421875, + "step": 1965 + }, + { + "epoch": 0.3900987152140483, + "grad_norm": 31.671671832195244, + "learning_rate": 7.886222329338974e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 3.87890625, + "logps/chosen": -882.0, + "logps/rejected": -598.5, + "loss": 0.4228, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.9892578125, + "rewards/margins": 6.2109375, + "rewards/rejected": -4.23046875, + "step": 1966 + }, + { + "epoch": 0.390297137754849, + "grad_norm": 36.23850550916606, + "learning_rate": 7.883579366732571e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.4765625, + "logps/chosen": -886.0, + "logps/rejected": -956.0, + "loss": 0.4414, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1171875, + "rewards/margins": 7.546875, + "rewards/rejected": -5.4140625, + "step": 1967 + }, + { + "epoch": 0.3904955602956496, + "grad_norm": 37.34568654386726, + "learning_rate": 7.880935260765486e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.3984375, + "logps/chosen": -1177.0, + "logps/rejected": -959.0, + "loss": 0.4209, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.41796875, + "rewards/margins": 8.3046875, + "rewards/rejected": -5.8828125, + "step": 1968 + }, + { + "epoch": 0.39069398283645024, + "grad_norm": 41.92682280006148, + "learning_rate": 7.87829001270605e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.1015625, + "logps/chosen": -916.0, + "logps/rejected": -820.0, + "loss": 0.4597, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.490234375, + "rewards/margins": 15.97265625, + "rewards/rejected": -14.421875, + "step": 1969 + }, + { + "epoch": 0.39089240537725084, + "grad_norm": 29.182541115970565, + "learning_rate": 7.87564362382314e-07, + "logits/chosen": 4.546875, + "logits/rejected": 4.1015625, + "logps/chosen": -1135.0, + "logps/rejected": -704.0, + "loss": 0.4626, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.873046875, + "rewards/margins": 6.8359375, + "rewards/rejected": -3.95703125, 
+ "step": 1970 + }, + { + "epoch": 0.3910908279180515, + "grad_norm": 34.9661511288282, + "learning_rate": 7.872996095386184e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.1171875, + "logps/chosen": -871.0, + "logps/rejected": -602.5, + "loss": 0.5139, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.647705078125, + "rewards/margins": 5.3671875, + "rewards/rejected": -3.72265625, + "step": 1971 + }, + { + "epoch": 0.3912892504588521, + "grad_norm": 30.646868109362586, + "learning_rate": 7.870347428665153e-07, + "logits/chosen": 4.5234375, + "logits/rejected": 4.4375, + "logps/chosen": -1094.0, + "logps/rejected": -1009.0, + "loss": 0.4699, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.0380859375, + "rewards/margins": 7.546875, + "rewards/rejected": -5.5078125, + "step": 1972 + }, + { + "epoch": 0.39148767299965276, + "grad_norm": 32.49159542365023, + "learning_rate": 7.867697624930565e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.30859375, + "logps/chosen": -1064.0, + "logps/rejected": -692.5, + "loss": 0.4974, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.41796875, + "rewards/margins": 6.31640625, + "rewards/rejected": -3.89453125, + "step": 1973 + }, + { + "epoch": 0.3916860955404534, + "grad_norm": 44.3947329276044, + "learning_rate": 7.865046685453485e-07, + "logits/chosen": 4.44140625, + "logits/rejected": 4.01171875, + "logps/chosen": -853.0, + "logps/rejected": -558.0, + "loss": 0.4344, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.228515625, + "rewards/margins": 5.75, + "rewards/rejected": -3.51953125, + "step": 1974 + }, + { + "epoch": 0.391884518081254, + "grad_norm": 39.99828859849703, + "learning_rate": 7.862394611505519e-07, + "logits/chosen": 3.96875, + "logits/rejected": 3.98828125, + "logps/chosen": -936.0, + "logps/rejected": -665.5, + "loss": 0.4582, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.71435546875, + "rewards/margins": 7.203125, + "rewards/rejected": -5.48046875, + "step": 
1975 + }, + { + "epoch": 0.3920829406220547, + "grad_norm": 29.094009717685864, + "learning_rate": 7.859741404358825e-07, + "logits/chosen": 4.46875, + "logits/rejected": 4.2109375, + "logps/chosen": -710.0, + "logps/rejected": -530.5, + "loss": 0.4547, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.84765625, + "rewards/margins": 5.5078125, + "rewards/rejected": -3.66796875, + "step": 1976 + }, + { + "epoch": 0.3922813631628553, + "grad_norm": 35.090629209360095, + "learning_rate": 7.857087065286094e-07, + "logits/chosen": 4.05078125, + "logits/rejected": 4.15625, + "logps/chosen": -1024.0, + "logps/rejected": -786.0, + "loss": 0.506, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.65625, + "rewards/margins": 6.65625, + "rewards/rejected": -4.013671875, + "step": 1977 + }, + { + "epoch": 0.39247978570365594, + "grad_norm": 35.48247677018976, + "learning_rate": 7.854431595560569e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.953125, + "logps/chosen": -832.0, + "logps/rejected": -579.5, + "loss": 0.5446, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.78515625, + "rewards/margins": 5.3125, + "rewards/rejected": -3.525390625, + "step": 1978 + }, + { + "epoch": 0.3926782082444566, + "grad_norm": 30.13163115030329, + "learning_rate": 7.851774996456028e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.28515625, + "logps/chosen": -973.5, + "logps/rejected": -1194.5, + "loss": 0.4193, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.068359375, + "rewards/margins": 8.19140625, + "rewards/rejected": -6.12109375, + "step": 1979 + }, + { + "epoch": 0.3928766307852572, + "grad_norm": 28.722681915346794, + "learning_rate": 7.849117269246798e-07, + "logits/chosen": 4.51953125, + "logits/rejected": 4.59375, + "logps/chosen": -918.0, + "logps/rejected": -613.5, + "loss": 0.5091, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.66552734375, + "rewards/margins": 5.546875, + "rewards/rejected": -3.87890625, + "step": 1980 + }, + { + 
"epoch": 0.39307505332605785, + "grad_norm": 39.956406094279146, + "learning_rate": 7.846458415207741e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.25390625, + "logps/chosen": -1345.5, + "logps/rejected": -1002.0, + "loss": 0.3739, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.361328125, + "rewards/margins": 8.47265625, + "rewards/rejected": -6.109375, + "step": 1981 + }, + { + "epoch": 0.39327347586685846, + "grad_norm": 28.282043307576757, + "learning_rate": 7.843798435614264e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 3.88671875, + "logps/chosen": -1093.0, + "logps/rejected": -870.0, + "loss": 0.4365, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.92578125, + "rewards/margins": 9.5625, + "rewards/rejected": -7.66015625, + "step": 1982 + }, + { + "epoch": 0.3934718984076591, + "grad_norm": 33.47034201276966, + "learning_rate": 7.841137331742311e-07, + "logits/chosen": 4.4921875, + "logits/rejected": 4.41796875, + "logps/chosen": -873.5, + "logps/rejected": -592.5, + "loss": 0.4744, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.7763671875, + "rewards/margins": 5.9453125, + "rewards/rejected": -4.171875, + "step": 1983 + }, + { + "epoch": 0.3936703209484597, + "grad_norm": 34.83394375989804, + "learning_rate": 7.838475104868364e-07, + "logits/chosen": 4.53125, + "logits/rejected": 4.55078125, + "logps/chosen": -985.0, + "logps/rejected": -687.0, + "loss": 0.4651, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.642578125, + "rewards/margins": 6.8046875, + "rewards/rejected": -4.17578125, + "step": 1984 + }, + { + "epoch": 0.3938687434892604, + "grad_norm": 35.062932909163585, + "learning_rate": 7.83581175626945e-07, + "logits/chosen": 3.953125, + "logits/rejected": 4.09375, + "logps/chosen": -1027.0, + "logps/rejected": -756.0, + "loss": 0.3477, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.744140625, + "rewards/margins": 7.0390625, + "rewards/rejected": -5.2734375, + "step": 1985 + }, + { + "epoch": 
0.39406716603006103, + "grad_norm": 41.95342709591603, + "learning_rate": 7.833147287223128e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.0546875, + "logps/chosen": -1296.0, + "logps/rejected": -831.0, + "loss": 0.448, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.74853515625, + "rewards/margins": 8.3046875, + "rewards/rejected": -6.5703125, + "step": 1986 + }, + { + "epoch": 0.39426558857086164, + "grad_norm": 42.53788429961387, + "learning_rate": 7.830481699007496e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.125, + "logps/chosen": -1506.0, + "logps/rejected": -943.0, + "loss": 0.4167, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.69921875, + "rewards/margins": 7.09375, + "rewards/rejected": -4.390625, + "step": 1987 + }, + { + "epoch": 0.3944640111116623, + "grad_norm": 32.3431542984099, + "learning_rate": 7.82781499290119e-07, + "logits/chosen": 3.64453125, + "logits/rejected": 4.07421875, + "logps/chosen": -1056.0, + "logps/rejected": -2159.0, + "loss": 0.3645, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.22265625, + "rewards/margins": 17.1328125, + "rewards/rejected": -14.91796875, + "step": 1988 + }, + { + "epoch": 0.3946624336524629, + "grad_norm": 30.351953672528797, + "learning_rate": 7.825147170183384e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.35546875, + "logps/chosen": -1217.0, + "logps/rejected": -679.0, + "loss": 0.4673, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.41015625, + "rewards/margins": 8.0625, + "rewards/rejected": -5.6640625, + "step": 1989 + }, + { + "epoch": 0.39486085619326355, + "grad_norm": 35.84236766136579, + "learning_rate": 7.822478232133779e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 3.9453125, + "logps/chosen": -1076.0, + "logps/rejected": -663.5, + "loss": 0.3955, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.353515625, + "rewards/margins": 7.71875, + "rewards/rejected": -5.36328125, + "step": 1990 + }, + { + "epoch": 
0.3950592787340642, + "grad_norm": 26.50981097280568, + "learning_rate": 7.819808180032622e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.01953125, + "logps/chosen": -1097.0, + "logps/rejected": -789.0, + "loss": 0.2843, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9140625, + "rewards/margins": 8.4921875, + "rewards/rejected": -5.56640625, + "step": 1991 + }, + { + "epoch": 0.3952577012748648, + "grad_norm": 35.46412921855235, + "learning_rate": 7.81713701516069e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.43359375, + "logps/chosen": -981.5, + "logps/rejected": -940.0, + "loss": 0.4335, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.322265625, + "rewards/margins": 8.09375, + "rewards/rejected": -5.78125, + "step": 1992 + }, + { + "epoch": 0.3954561238156655, + "grad_norm": 35.47802570224852, + "learning_rate": 7.814464738799291e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.640625, + "logps/chosen": -732.0, + "logps/rejected": -955.5, + "loss": 0.5071, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.26171875, + "rewards/margins": 6.640625, + "rewards/rejected": -4.380859375, + "step": 1993 + }, + { + "epoch": 0.3956545463564661, + "grad_norm": 36.10957850547057, + "learning_rate": 7.811791352230268e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 3.703125, + "logps/chosen": -639.0, + "logps/rejected": -568.0, + "loss": 0.5193, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.288818359375, + "rewards/margins": 4.53125, + "rewards/rejected": -3.24609375, + "step": 1994 + }, + { + "epoch": 0.39585296889726673, + "grad_norm": 25.853280943822913, + "learning_rate": 7.809116856736e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 3.9921875, + "logps/chosen": -1142.0, + "logps/rejected": -781.0, + "loss": 0.32, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.951171875, + "rewards/margins": 7.90625, + "rewards/rejected": -4.9609375, + "step": 1995 + }, + { + "epoch": 0.3960513914380674, + 
"grad_norm": 36.54211149287338, + "learning_rate": 7.806441253599393e-07, + "logits/chosen": 4.3515625, + "logits/rejected": 4.1796875, + "logps/chosen": -931.5, + "logps/rejected": -663.0, + "loss": 0.5256, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.490997314453125, + "rewards/margins": 5.29296875, + "rewards/rejected": -3.78515625, + "step": 1996 + }, + { + "epoch": 0.396249813978868, + "grad_norm": 33.7583690809944, + "learning_rate": 7.803764544103885e-07, + "logits/chosen": 3.87109375, + "logits/rejected": 3.97265625, + "logps/chosen": -1078.0, + "logps/rejected": -690.0, + "loss": 0.3391, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.05078125, + "rewards/margins": 7.4140625, + "rewards/rejected": -5.3671875, + "step": 1997 + }, + { + "epoch": 0.39644823651966865, + "grad_norm": 30.76894772313314, + "learning_rate": 7.801086729533452e-07, + "logits/chosen": 4.0, + "logits/rejected": 3.9921875, + "logps/chosen": -759.0, + "logps/rejected": -837.0, + "loss": 0.41, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.896484375, + "rewards/margins": 18.765625, + "rewards/rejected": -16.85546875, + "step": 1998 + }, + { + "epoch": 0.39664665906046925, + "grad_norm": 33.270425305530445, + "learning_rate": 7.798407811172586e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.13671875, + "logps/chosen": -947.0, + "logps/rejected": -841.0, + "loss": 0.3707, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.259765625, + "rewards/margins": 8.21875, + "rewards/rejected": -5.95703125, + "step": 1999 + }, + { + "epoch": 0.3968450816012699, + "grad_norm": 29.796552502183776, + "learning_rate": 7.79572779030632e-07, + "logits/chosen": 4.46875, + "logits/rejected": 4.32421875, + "logps/chosen": -1630.0, + "logps/rejected": -658.0, + "loss": 0.4628, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.712890625, + "rewards/margins": 5.2578125, + "rewards/rejected": -4.5546875, + "step": 2000 + }, + { + "epoch": 0.3970435041420705, + "grad_norm": 
40.22675498826341, + "learning_rate": 7.793046668220213e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.3515625, + "logps/chosen": -759.5, + "logps/rejected": -617.0, + "loss": 0.5392, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.9013671875, + "rewards/margins": 4.8984375, + "rewards/rejected": -2.994140625, + "step": 2001 + }, + { + "epoch": 0.39724192668287117, + "grad_norm": 32.3102032310204, + "learning_rate": 7.790364446200352e-07, + "logits/chosen": 3.77734375, + "logits/rejected": 3.5390625, + "logps/chosen": -791.0, + "logps/rejected": -465.5, + "loss": 0.4899, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.703125, + "rewards/margins": 5.16796875, + "rewards/rejected": -3.466796875, + "step": 2002 + }, + { + "epoch": 0.39744034922367183, + "grad_norm": 29.86128704309672, + "learning_rate": 7.78768112553335e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.21484375, + "logps/chosen": -911.5, + "logps/rejected": -720.0, + "loss": 0.5556, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.6171875, + "rewards/margins": 5.2880859375, + "rewards/rejected": -3.666015625, + "step": 2003 + }, + { + "epoch": 0.39763877176447243, + "grad_norm": 28.63407149408201, + "learning_rate": 7.784996707506349e-07, + "logits/chosen": 4.5703125, + "logits/rejected": 4.859375, + "logps/chosen": -862.5, + "logps/rejected": -712.0, + "loss": 0.4524, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.21533203125, + "rewards/margins": 6.37109375, + "rewards/rejected": -4.15234375, + "step": 2004 + }, + { + "epoch": 0.3978371943052731, + "grad_norm": 35.039567086073966, + "learning_rate": 7.782311193407015e-07, + "logits/chosen": 3.54296875, + "logits/rejected": 3.828125, + "logps/chosen": -753.0, + "logps/rejected": -633.5, + "loss": 0.5644, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.9716796875, + "rewards/margins": 3.80859375, + "rewards/rejected": -2.837890625, + "step": 2005 + }, + { + "epoch": 0.3980356168460737, + 
"grad_norm": 40.692050946032, + "learning_rate": 7.779624584523545e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.65234375, + "logps/chosen": -1246.0, + "logps/rejected": -615.5, + "loss": 0.3994, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.75, + "rewards/margins": 6.6796875, + "rewards/rejected": -3.93359375, + "step": 2006 + }, + { + "epoch": 0.39823403938687435, + "grad_norm": 27.14170071560557, + "learning_rate": 7.776936882144655e-07, + "logits/chosen": 4.5859375, + "logits/rejected": 4.2421875, + "logps/chosen": -1183.0, + "logps/rejected": -800.0, + "loss": 0.3344, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.390625, + "rewards/margins": 7.984375, + "rewards/rejected": -4.578125, + "step": 2007 + }, + { + "epoch": 0.398432461927675, + "grad_norm": 45.274962388749906, + "learning_rate": 7.774248087559589e-07, + "logits/chosen": 4.37109375, + "logits/rejected": 4.203125, + "logps/chosen": -1295.0, + "logps/rejected": -831.0, + "loss": 0.5276, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.30078125, + "rewards/margins": 4.73828125, + "rewards/rejected": -2.439453125, + "step": 2008 + }, + { + "epoch": 0.3986308844684756, + "grad_norm": 29.552358358559456, + "learning_rate": 7.771558202058112e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.2265625, + "logps/chosen": -956.5, + "logps/rejected": -785.0, + "loss": 0.4365, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.228515625, + "rewards/margins": 6.8984375, + "rewards/rejected": -4.68359375, + "step": 2009 + }, + { + "epoch": 0.39882930700927627, + "grad_norm": 29.750615522947825, + "learning_rate": 7.768867226930518e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.16796875, + "logps/chosen": -970.0, + "logps/rejected": -540.5, + "loss": 0.3453, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.46875, + "rewards/margins": 6.0859375, + "rewards/rejected": -3.6171875, + "step": 2010 + }, + { + "epoch": 0.39902772955007687, + "grad_norm": 
40.85729508264731, + "learning_rate": 7.766175163467614e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 3.99609375, + "logps/chosen": -869.5, + "logps/rejected": -878.0, + "loss": 0.5724, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.358642578125, + "rewards/margins": 5.3984375, + "rewards/rejected": -4.0390625, + "step": 2011 + }, + { + "epoch": 0.39922615209087753, + "grad_norm": 37.43873854293179, + "learning_rate": 7.763482012960743e-07, + "logits/chosen": 4.3984375, + "logits/rejected": 4.1484375, + "logps/chosen": -1222.0, + "logps/rejected": -677.0, + "loss": 0.4571, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.75390625, + "rewards/margins": 6.38671875, + "rewards/rejected": -3.630859375, + "step": 2012 + }, + { + "epoch": 0.39942457463167813, + "grad_norm": 29.057601947102455, + "learning_rate": 7.760787776701754e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 3.9921875, + "logps/chosen": -881.0, + "logps/rejected": -1020.0, + "loss": 0.4154, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.052734375, + "rewards/margins": 7.5546875, + "rewards/rejected": -5.48828125, + "step": 2013 + }, + { + "epoch": 0.3996229971724788, + "grad_norm": 36.952638649698756, + "learning_rate": 7.758092455983029e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 4.4375, + "logps/chosen": -992.0, + "logps/rejected": -933.0, + "loss": 0.482, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.37890625, + "rewards/margins": 7.26171875, + "rewards/rejected": -4.87890625, + "step": 2014 + }, + { + "epoch": 0.39982141971327945, + "grad_norm": 27.807079373953314, + "learning_rate": 7.755396052097462e-07, + "logits/chosen": 4.0625, + "logits/rejected": 3.99609375, + "logps/chosen": -951.0, + "logps/rejected": -660.0, + "loss": 0.4384, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.080078125, + "rewards/margins": 7.4140625, + "rewards/rejected": -5.330078125, + "step": 2015 + }, + { + "epoch": 0.40001984225408005, + "grad_norm": 
36.22331321507354, + "learning_rate": 7.752698566338475e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.1328125, + "logps/chosen": -1245.0, + "logps/rejected": -1062.0, + "loss": 0.4054, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.09765625, + "rewards/margins": 7.921875, + "rewards/rejected": -4.828125, + "step": 2016 + }, + { + "epoch": 0.4002182647948807, + "grad_norm": 28.274412205198477, + "learning_rate": 7.75e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.2265625, + "logps/chosen": -761.0, + "logps/rejected": -771.5, + "loss": 0.5785, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.701171875, + "rewards/margins": 16.23828125, + "rewards/rejected": -14.49609375, + "step": 2017 + }, + { + "epoch": 0.4004166873356813, + "grad_norm": 32.4299334759887, + "learning_rate": 7.747300354376493e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.296875, + "logps/chosen": -1130.0, + "logps/rejected": -888.0, + "loss": 0.4031, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.900390625, + "rewards/margins": 8.03125, + "rewards/rejected": -5.125, + "step": 2018 + }, + { + "epoch": 0.40061510987648197, + "grad_norm": 33.497813845344645, + "learning_rate": 7.744599630762923e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.484375, + "logps/chosen": -1081.0, + "logps/rejected": -1100.5, + "loss": 0.4797, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.2265625, + "rewards/margins": 17.4609375, + "rewards/rejected": -15.23828125, + "step": 2019 + }, + { + "epoch": 0.4008135324172826, + "grad_norm": 31.82965425402204, + "learning_rate": 7.741897830454783e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.4609375, + "logps/chosen": -1078.0, + "logps/rejected": -842.0, + "loss": 0.4335, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.197265625, + "rewards/margins": 6.6015625, + "rewards/rejected": -4.42578125, + "step": 2020 + }, + { + "epoch": 0.4010119549580832, + "grad_norm": 36.04986301166811, + 
"learning_rate": 7.739194954748077e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 3.890625, + "logps/chosen": -961.0, + "logps/rejected": -621.0, + "loss": 0.4071, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.5390625, + "rewards/margins": 6.546875, + "rewards/rejected": -3.013671875, + "step": 2021 + }, + { + "epoch": 0.4012103774988839, + "grad_norm": 38.03010627384919, + "learning_rate": 7.736491004939328e-07, + "logits/chosen": 4.35546875, + "logits/rejected": 4.5703125, + "logps/chosen": -944.0, + "logps/rejected": -917.0, + "loss": 0.4756, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.568359375, + "rewards/margins": 8.0234375, + "rewards/rejected": -6.4609375, + "step": 2022 + }, + { + "epoch": 0.4014088000396845, + "grad_norm": 30.863580022519596, + "learning_rate": 7.73378598232557e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.00390625, + "logps/chosen": -889.0, + "logps/rejected": -606.0, + "loss": 0.3494, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.38671875, + "rewards/margins": 7.3671875, + "rewards/rejected": -4.97265625, + "step": 2023 + }, + { + "epoch": 0.40160722258048515, + "grad_norm": 31.16682962160441, + "learning_rate": 7.731079888204357e-07, + "logits/chosen": 4.5625, + "logits/rejected": 4.5546875, + "logps/chosen": -1414.0, + "logps/rejected": -963.0, + "loss": 0.3299, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.25390625, + "rewards/margins": 9.3359375, + "rewards/rejected": -6.08984375, + "step": 2024 + }, + { + "epoch": 0.4018056451212858, + "grad_norm": 34.17849512025232, + "learning_rate": 7.728372723873751e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.00390625, + "logps/chosen": -1352.0, + "logps/rejected": -744.0, + "loss": 0.3309, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.41015625, + "rewards/margins": 7.484375, + "rewards/rejected": -5.08203125, + "step": 2025 + }, + { + "epoch": 0.4020040676620864, + "grad_norm": 33.60325606895586, + "learning_rate": 
7.725664490632333e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 3.8359375, + "logps/chosen": -1048.0, + "logps/rejected": -712.0, + "loss": 0.364, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.65625, + "rewards/margins": 7.9453125, + "rewards/rejected": -5.296875, + "step": 2026 + }, + { + "epoch": 0.40220249020288706, + "grad_norm": 37.37115272886485, + "learning_rate": 7.722955189779194e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 4.1171875, + "logps/chosen": -800.0, + "logps/rejected": -578.0, + "loss": 0.5349, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.71875, + "rewards/margins": 4.447265625, + "rewards/rejected": -3.73046875, + "step": 2027 + }, + { + "epoch": 0.40240091274368767, + "grad_norm": 26.84436114470687, + "learning_rate": 7.720244822613937e-07, + "logits/chosen": 3.796875, + "logits/rejected": 3.6640625, + "logps/chosen": -1024.0, + "logps/rejected": -802.0, + "loss": 0.4335, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.63671875, + "rewards/margins": 7.3203125, + "rewards/rejected": -4.6875, + "step": 2028 + }, + { + "epoch": 0.4025993352844883, + "grad_norm": 32.80848176706004, + "learning_rate": 7.717533390436679e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.34765625, + "logps/chosen": -887.5, + "logps/rejected": -703.0, + "loss": 0.5212, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.05078125, + "rewards/margins": 6.53515625, + "rewards/rejected": -4.486328125, + "step": 2029 + }, + { + "epoch": 0.4027977578252889, + "grad_norm": 35.55202810505876, + "learning_rate": 7.714820894548043e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.28125, + "logps/chosen": -932.0, + "logps/rejected": -1258.0, + "loss": 0.4361, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9765625, + "rewards/margins": 7.40625, + "rewards/rejected": -5.41796875, + "step": 2030 + }, + { + "epoch": 0.4029961803660896, + "grad_norm": 34.75021845506865, + "learning_rate": 7.712107336249168e-07, + 
"logits/chosen": 4.265625, + "logits/rejected": 4.25, + "logps/chosen": -1380.0, + "logps/rejected": -825.0, + "loss": 0.3515, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.01953125, + "rewards/margins": 8.2890625, + "rewards/rejected": -5.28125, + "step": 2031 + }, + { + "epoch": 0.40319460290689024, + "grad_norm": 32.69829744168848, + "learning_rate": 7.709392716841697e-07, + "logits/chosen": 4.34765625, + "logits/rejected": 3.9921875, + "logps/chosen": -1079.0, + "logps/rejected": -707.5, + "loss": 0.4105, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.80859375, + "rewards/margins": 7.0859375, + "rewards/rejected": -4.2734375, + "step": 2032 + }, + { + "epoch": 0.40339302544769085, + "grad_norm": 29.11955285023472, + "learning_rate": 7.706677037627784e-07, + "logits/chosen": 3.52734375, + "logits/rejected": 3.578125, + "logps/chosen": -786.0, + "logps/rejected": -540.5, + "loss": 0.4891, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.23974609375, + "rewards/margins": 5.0, + "rewards/rejected": -3.76953125, + "step": 2033 + }, + { + "epoch": 0.4035914479884915, + "grad_norm": 30.059490924734117, + "learning_rate": 7.703960299910097e-07, + "logits/chosen": 4.6328125, + "logits/rejected": 4.5546875, + "logps/chosen": -907.5, + "logps/rejected": -696.0, + "loss": 0.3953, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.376953125, + "rewards/margins": 6.6796875, + "rewards/rejected": -4.29296875, + "step": 2034 + }, + { + "epoch": 0.4037898705292921, + "grad_norm": 44.41029423778604, + "learning_rate": 7.701242504991802e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.65234375, + "logps/chosen": -987.0, + "logps/rejected": -789.5, + "loss": 0.4652, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.58203125, + "rewards/margins": 6.12109375, + "rewards/rejected": -4.54296875, + "step": 2035 + }, + { + "epoch": 0.40398829307009276, + "grad_norm": 35.940176458238746, + "learning_rate": 7.698523654176578e-07, + "logits/chosen": 4.03125, 
+ "logits/rejected": 4.0859375, + "logps/chosen": -520.0, + "logps/rejected": -554.0, + "loss": 0.4652, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.94921875, + "rewards/margins": 5.3828125, + "rewards/rejected": -3.4345703125, + "step": 2036 + }, + { + "epoch": 0.4041867156108934, + "grad_norm": 42.137265070488496, + "learning_rate": 7.695803748768611e-07, + "logits/chosen": 3.54296875, + "logits/rejected": 3.859375, + "logps/chosen": -941.0, + "logps/rejected": -686.0, + "loss": 0.4944, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.927734375, + "rewards/margins": 7.875, + "rewards/rejected": -5.96484375, + "step": 2037 + }, + { + "epoch": 0.404385138151694, + "grad_norm": 25.05248045457878, + "learning_rate": 7.69308279007259e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 3.96875, + "logps/chosen": -1379.0, + "logps/rejected": -870.0, + "loss": 0.4509, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.03515625, + "rewards/margins": 7.5390625, + "rewards/rejected": -5.49609375, + "step": 2038 + }, + { + "epoch": 0.4045835606924947, + "grad_norm": 32.73795066808033, + "learning_rate": 7.690360779393712e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 4.30078125, + "logps/chosen": -713.0, + "logps/rejected": -1616.0, + "loss": 0.3805, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.7099609375, + "rewards/margins": 9.1953125, + "rewards/rejected": -7.4765625, + "step": 2039 + }, + { + "epoch": 0.4047819832332953, + "grad_norm": 34.28928620398274, + "learning_rate": 7.687637718037675e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.0625, + "logps/chosen": -1211.0, + "logps/rejected": -829.0, + "loss": 0.3465, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.53125, + "rewards/margins": 8.859375, + "rewards/rejected": -6.328125, + "step": 2040 + }, + { + "epoch": 0.40498040577409594, + "grad_norm": 33.888189040857476, + "learning_rate": 7.684913607310683e-07, + "logits/chosen": 4.46484375, + "logits/rejected": 
4.08984375, + "logps/chosen": -964.0, + "logps/rejected": -923.5, + "loss": 0.4233, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.6474609375, + "rewards/margins": 6.5, + "rewards/rejected": -4.85546875, + "step": 2041 + }, + { + "epoch": 0.4051788283148966, + "grad_norm": 34.905006014739186, + "learning_rate": 7.682188448519448e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 3.44921875, + "logps/chosen": -1239.0, + "logps/rejected": -946.0, + "loss": 0.5408, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.01318359375, + "rewards/margins": 6.04296875, + "rewards/rejected": -4.03125, + "step": 2042 + }, + { + "epoch": 0.4053772508556972, + "grad_norm": 30.428776688812906, + "learning_rate": 7.679462242971175e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.23046875, + "logps/chosen": -968.5, + "logps/rejected": -679.0, + "loss": 0.4338, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.439453125, + "rewards/margins": 6.4140625, + "rewards/rejected": -3.96484375, + "step": 2043 + }, + { + "epoch": 0.40557567339649786, + "grad_norm": 25.216771043230867, + "learning_rate": 7.676734991973579e-07, + "logits/chosen": 4.5703125, + "logits/rejected": 4.59375, + "logps/chosen": -950.0, + "logps/rejected": -751.0, + "loss": 0.3964, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.421875, + "rewards/margins": 7.7734375, + "rewards/rejected": -5.34375, + "step": 2044 + }, + { + "epoch": 0.40577409593729846, + "grad_norm": 34.48416711182546, + "learning_rate": 7.674006696834872e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.36328125, + "logps/chosen": -1173.0, + "logps/rejected": -907.0, + "loss": 0.3777, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.40234375, + "rewards/margins": 9.24609375, + "rewards/rejected": -6.8359375, + "step": 2045 + }, + { + "epoch": 0.4059725184780991, + "grad_norm": 28.36893050660215, + "learning_rate": 7.671277358863772e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 4.07421875, 
+ "logps/chosen": -1111.0, + "logps/rejected": -845.5, + "loss": 0.3613, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.181640625, + "rewards/margins": 9.2109375, + "rewards/rejected": -6.0390625, + "step": 2046 + }, + { + "epoch": 0.4061709410188997, + "grad_norm": 31.872479935269226, + "learning_rate": 7.668546979369491e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.19921875, + "logps/chosen": -1100.0, + "logps/rejected": -896.0, + "loss": 0.2976, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.615234375, + "rewards/margins": 8.078125, + "rewards/rejected": -5.47265625, + "step": 2047 + }, + { + "epoch": 0.4063693635597004, + "grad_norm": 29.3662544790992, + "learning_rate": 7.665815559661745e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.13671875, + "logps/chosen": -990.0, + "logps/rejected": -927.0, + "loss": 0.3656, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.1875, + "rewards/margins": 7.0546875, + "rewards/rejected": -4.85546875, + "step": 2048 + }, + { + "epoch": 0.40656778610050104, + "grad_norm": 29.36000936608504, + "learning_rate": 7.663083101050747e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 4.07421875, + "logps/chosen": -798.0, + "logps/rejected": -764.0, + "loss": 0.3228, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.40234375, + "rewards/margins": 8.0703125, + "rewards/rejected": -5.66015625, + "step": 2049 + }, + { + "epoch": 0.40676620864130164, + "grad_norm": 27.617249827378537, + "learning_rate": 7.660349604847207e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.5, + "logps/chosen": -1314.0, + "logps/rejected": -948.0, + "loss": 0.4962, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.80078125, + "rewards/margins": 7.7734375, + "rewards/rejected": -4.96484375, + "step": 2050 + }, + { + "epoch": 0.4069646311821023, + "grad_norm": 31.24958075872492, + "learning_rate": 7.657615072362337e-07, + "logits/chosen": 3.765625, + "logits/rejected": 4.16796875, + 
"logps/chosen": -1289.0, + "logps/rejected": -1035.0, + "loss": 0.3322, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.587890625, + "rewards/margins": 12.53125, + "rewards/rejected": -9.9296875, + "step": 2051 + }, + { + "epoch": 0.4071630537229029, + "grad_norm": 42.117804441363695, + "learning_rate": 7.654879504907845e-07, + "logits/chosen": 4.046875, + "logits/rejected": 3.8671875, + "logps/chosen": -909.0, + "logps/rejected": -669.0, + "loss": 0.4984, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.858642578125, + "rewards/margins": 4.86328125, + "rewards/rejected": -2.994140625, + "step": 2052 + }, + { + "epoch": 0.40736147626370356, + "grad_norm": 42.42545507296744, + "learning_rate": 7.652142903795932e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 4.0, + "logps/chosen": -1008.0, + "logps/rejected": -531.0, + "loss": 0.3802, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.708984375, + "rewards/margins": 5.9453125, + "rewards/rejected": -4.234375, + "step": 2053 + }, + { + "epoch": 0.4075598988045042, + "grad_norm": 32.55528353265756, + "learning_rate": 7.649405270339296e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 3.98828125, + "logps/chosen": -965.0, + "logps/rejected": -636.5, + "loss": 0.4083, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.556640625, + "rewards/margins": 6.2890625, + "rewards/rejected": -3.73046875, + "step": 2054 + }, + { + "epoch": 0.4077583213453048, + "grad_norm": 23.704878959671387, + "learning_rate": 7.646666605851134e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.08203125, + "logps/chosen": -1156.0, + "logps/rejected": -859.0, + "loss": 0.3018, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.572265625, + "rewards/margins": 7.84375, + "rewards/rejected": -5.2734375, + "step": 2055 + }, + { + "epoch": 0.4079567438861055, + "grad_norm": 32.90232654337003, + "learning_rate": 7.64392691164513e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 3.72265625, + 
"logps/chosen": -1515.0, + "logps/rejected": -688.0, + "loss": 0.4922, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.19140625, + "rewards/margins": 3.97119140625, + "rewards/rejected": -3.78125, + "step": 2056 + }, + { + "epoch": 0.4081551664269061, + "grad_norm": 37.480407710242794, + "learning_rate": 7.641186189035472e-07, + "logits/chosen": 3.54296875, + "logits/rejected": 3.66015625, + "logps/chosen": -1088.0, + "logps/rejected": -772.0, + "loss": 0.4067, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.853515625, + "rewards/margins": 7.515625, + "rewards/rejected": -5.66796875, + "step": 2057 + }, + { + "epoch": 0.40835358896770674, + "grad_norm": 37.22796486861562, + "learning_rate": 7.638444439336832e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.40625, + "logps/chosen": -1125.0, + "logps/rejected": -1487.0, + "loss": 0.5485, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.0859375, + "rewards/margins": 9.0625, + "rewards/rejected": -6.9755859375, + "step": 2058 + }, + { + "epoch": 0.40855201150850734, + "grad_norm": 29.44887131070767, + "learning_rate": 7.635701663864381e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.16796875, + "logps/chosen": -1111.0, + "logps/rejected": -667.0, + "loss": 0.4965, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.90234375, + "rewards/margins": 6.61328125, + "rewards/rejected": -4.70703125, + "step": 2059 + }, + { + "epoch": 0.408750434049308, + "grad_norm": 37.320521872010936, + "learning_rate": 7.632957863933779e-07, + "logits/chosen": 3.65625, + "logits/rejected": 3.58203125, + "logps/chosen": -1629.0, + "logps/rejected": -604.0, + "loss": 0.4433, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2421875, + "rewards/margins": 3.15625, + "rewards/rejected": -4.39453125, + "step": 2060 + }, + { + "epoch": 0.40894885659010866, + "grad_norm": 33.658562520668625, + "learning_rate": 7.630213040861178e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.35546875, + 
"logps/chosen": -1148.0, + "logps/rejected": -925.0, + "loss": 0.4367, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.228515625, + "rewards/margins": 7.359375, + "rewards/rejected": -5.12109375, + "step": 2061 + }, + { + "epoch": 0.40914727913090926, + "grad_norm": 44.82993244570266, + "learning_rate": 7.627467195963222e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.046875, + "logps/chosen": -1015.0, + "logps/rejected": -1335.0, + "loss": 0.3619, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.28125, + "rewards/margins": 12.0703125, + "rewards/rejected": -9.7890625, + "step": 2062 + }, + { + "epoch": 0.4093457016717099, + "grad_norm": 52.084078355712904, + "learning_rate": 7.62472033055704e-07, + "logits/chosen": 3.7421875, + "logits/rejected": 3.92578125, + "logps/chosen": -1278.0, + "logps/rejected": -791.0, + "loss": 0.4451, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.826171875, + "rewards/margins": 7.03125, + "rewards/rejected": -5.2109375, + "step": 2063 + }, + { + "epoch": 0.4095441242125105, + "grad_norm": 31.739048339096204, + "learning_rate": 7.621972445960261e-07, + "logits/chosen": 3.75390625, + "logits/rejected": 4.2265625, + "logps/chosen": -1154.0, + "logps/rejected": -948.0, + "loss": 0.438, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.24609375, + "rewards/margins": 7.53125, + "rewards/rejected": -5.26171875, + "step": 2064 + }, + { + "epoch": 0.4097425467533112, + "grad_norm": 29.28966817813952, + "learning_rate": 7.619223543490992e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.6796875, + "logps/chosen": -1217.5, + "logps/rejected": -637.5, + "loss": 0.3666, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.416015625, + "rewards/margins": 7.3125, + "rewards/rejected": -4.90234375, + "step": 2065 + }, + { + "epoch": 0.40994096929411183, + "grad_norm": 27.93123750418427, + "learning_rate": 7.616473624467831e-07, + "logits/chosen": 4.57421875, + "logits/rejected": 4.3515625, + "logps/chosen": 
-1320.0, + "logps/rejected": -926.5, + "loss": 0.3143, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0625, + "rewards/margins": 7.72265625, + "rewards/rejected": -4.654296875, + "step": 2066 + }, + { + "epoch": 0.41013939183491244, + "grad_norm": 33.810804241312, + "learning_rate": 7.61372269020987e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 4.09765625, + "logps/chosen": -707.5, + "logps/rejected": -617.5, + "loss": 0.5233, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.62353515625, + "rewards/margins": 4.9765625, + "rewards/rejected": -3.349609375, + "step": 2067 + }, + { + "epoch": 0.4103378143757131, + "grad_norm": 32.06816268544709, + "learning_rate": 7.610970742036679e-07, + "logits/chosen": 4.33984375, + "logits/rejected": 4.0546875, + "logps/chosen": -1226.0, + "logps/rejected": -701.0, + "loss": 0.3799, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7578125, + "rewards/margins": 7.3828125, + "rewards/rejected": -4.62890625, + "step": 2068 + }, + { + "epoch": 0.4105362369165137, + "grad_norm": 28.56165754230848, + "learning_rate": 7.608217781268324e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.30078125, + "logps/chosen": -768.0, + "logps/rejected": -616.0, + "loss": 0.472, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.28125, + "rewards/margins": 5.3515625, + "rewards/rejected": -3.07421875, + "step": 2069 + }, + { + "epoch": 0.41073465945731436, + "grad_norm": 30.312880806636127, + "learning_rate": 7.605463809225347e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.35546875, + "logps/chosen": -1351.0, + "logps/rejected": -808.5, + "loss": 0.456, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.75, + "rewards/margins": 6.546875, + "rewards/rejected": -3.7890625, + "step": 2070 + }, + { + "epoch": 0.410933081998115, + "grad_norm": 25.337853494742383, + "learning_rate": 7.602708827228779e-07, + "logits/chosen": 4.33984375, + "logits/rejected": 4.23828125, + "logps/chosen": -860.5, + 
"logps/rejected": -608.5, + "loss": 0.3629, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.595703125, + "rewards/margins": 6.85546875, + "rewards/rejected": -4.265625, + "step": 2071 + }, + { + "epoch": 0.4111315045389156, + "grad_norm": 30.69324982865856, + "learning_rate": 7.599952836600139e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 3.87109375, + "logps/chosen": -1021.0, + "logps/rejected": -582.5, + "loss": 0.4549, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.880859375, + "rewards/margins": 6.24609375, + "rewards/rejected": -4.375, + "step": 2072 + }, + { + "epoch": 0.4113299270797163, + "grad_norm": 29.34011443810807, + "learning_rate": 7.597195838661425e-07, + "logits/chosen": 3.6796875, + "logits/rejected": 3.9765625, + "logps/chosen": -1060.5, + "logps/rejected": -1883.0, + "loss": 0.4157, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.53515625, + "rewards/margins": 9.953125, + "rewards/rejected": -7.42578125, + "step": 2073 + }, + { + "epoch": 0.4115283496205169, + "grad_norm": 25.769219373890365, + "learning_rate": 7.59443783473512e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 4.07421875, + "logps/chosen": -1382.0, + "logps/rejected": -870.0, + "loss": 0.2902, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4140625, + "rewards/margins": 9.0703125, + "rewards/rejected": -5.65234375, + "step": 2074 + }, + { + "epoch": 0.41172677216131753, + "grad_norm": 27.63357689684919, + "learning_rate": 7.59167882614419e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.3515625, + "logps/chosen": -1307.0, + "logps/rejected": -998.0, + "loss": 0.4124, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.259765625, + "rewards/margins": 8.7734375, + "rewards/rejected": -5.5078125, + "step": 2075 + }, + { + "epoch": 0.41192519470211814, + "grad_norm": 40.34275685013929, + "learning_rate": 7.588918814212083e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.484375, + "logps/chosen": -1029.0, + 
"logps/rejected": -820.5, + "loss": 0.4672, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.10791015625, + "rewards/margins": 7.390625, + "rewards/rejected": -5.28515625, + "step": 2076 + }, + { + "epoch": 0.4121236172429188, + "grad_norm": 33.73772204847842, + "learning_rate": 7.586157800262725e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.21875, + "logps/chosen": -1554.0, + "logps/rejected": -827.0, + "loss": 0.3689, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.41796875, + "rewards/margins": 8.4375, + "rewards/rejected": -6.02734375, + "step": 2077 + }, + { + "epoch": 0.41232203978371945, + "grad_norm": 32.21327559030334, + "learning_rate": 7.583395785620527e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.06640625, + "logps/chosen": -874.0, + "logps/rejected": -589.5, + "loss": 0.3558, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.2744140625, + "rewards/margins": 7.25, + "rewards/rejected": -4.97265625, + "step": 2078 + }, + { + "epoch": 0.41252046232452005, + "grad_norm": 26.077485185066415, + "learning_rate": 7.580632771610381e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.19921875, + "logps/chosen": -1217.0, + "logps/rejected": -868.0, + "loss": 0.4195, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.951171875, + "rewards/margins": 8.9296875, + "rewards/rejected": -5.99609375, + "step": 2079 + }, + { + "epoch": 0.4127188848653207, + "grad_norm": 30.00948242085594, + "learning_rate": 7.577868759557653e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 3.9765625, + "logps/chosen": -816.5, + "logps/rejected": -1215.0, + "loss": 0.4818, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.62890625, + "rewards/margins": 7.75390625, + "rewards/rejected": -6.134765625, + "step": 2080 + }, + { + "epoch": 0.4129173074061213, + "grad_norm": 31.904871767366046, + "learning_rate": 7.575103750788188e-07, + "logits/chosen": 3.984375, + "logits/rejected": 3.9140625, + "logps/chosen": -910.0, + 
"logps/rejected": -768.0, + "loss": 0.4695, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.208984375, + "rewards/margins": 6.6484375, + "rewards/rejected": -4.44921875, + "step": 2081 + }, + { + "epoch": 0.413115729946922, + "grad_norm": 30.586630779695774, + "learning_rate": 7.572337746628318e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.11328125, + "logps/chosen": -748.0, + "logps/rejected": -593.0, + "loss": 0.4324, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.80859375, + "rewards/margins": 6.609375, + "rewards/rejected": -3.80078125, + "step": 2082 + }, + { + "epoch": 0.41331415248772263, + "grad_norm": 37.13465568773286, + "learning_rate": 7.569570748404841e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.05078125, + "logps/chosen": -938.5, + "logps/rejected": -603.0, + "loss": 0.3309, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.173828125, + "rewards/margins": 7.09375, + "rewards/rejected": -4.93359375, + "step": 2083 + }, + { + "epoch": 0.41351257502852323, + "grad_norm": 29.566589116704296, + "learning_rate": 7.566802757445038e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.24609375, + "logps/chosen": -824.0, + "logps/rejected": -602.5, + "loss": 0.4401, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.072265625, + "rewards/margins": 6.84375, + "rewards/rejected": -4.76953125, + "step": 2084 + }, + { + "epoch": 0.4137109975693239, + "grad_norm": 36.61675810395284, + "learning_rate": 7.564033775076666e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.9140625, + "logps/chosen": -1051.0, + "logps/rejected": -610.5, + "loss": 0.4492, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.7470703125, + "rewards/margins": 8.390625, + "rewards/rejected": -6.640625, + "step": 2085 + }, + { + "epoch": 0.4139094201101245, + "grad_norm": 30.85885894800469, + "learning_rate": 7.561263802627955e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 4.05078125, + "logps/chosen": -1275.0, + "logps/rejected": 
-733.0, + "loss": 0.4356, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.94970703125, + "rewards/margins": 7.375, + "rewards/rejected": -5.421875, + "step": 2086 + }, + { + "epoch": 0.41410784265092515, + "grad_norm": 40.89765004726484, + "learning_rate": 7.558492841427612e-07, + "logits/chosen": 3.75390625, + "logits/rejected": 3.63671875, + "logps/chosen": -1136.0, + "logps/rejected": -827.5, + "loss": 0.3985, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.72802734375, + "rewards/margins": 7.546875, + "rewards/rejected": -5.8125, + "step": 2087 + }, + { + "epoch": 0.41430626519172575, + "grad_norm": 34.24655377333938, + "learning_rate": 7.555720892804822e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.71875, + "logps/chosen": -817.0, + "logps/rejected": -630.5, + "loss": 0.4343, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.469970703125, + "rewards/margins": 6.078125, + "rewards/rejected": -4.61328125, + "step": 2088 + }, + { + "epoch": 0.4145046877325264, + "grad_norm": 27.964865816046498, + "learning_rate": 7.552947958089233e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 4.00390625, + "logps/chosen": -840.0, + "logps/rejected": -811.0, + "loss": 0.3117, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.3486328125, + "rewards/margins": 8.7421875, + "rewards/rejected": -6.3984375, + "step": 2089 + }, + { + "epoch": 0.41470311027332707, + "grad_norm": 30.058648410361602, + "learning_rate": 7.550174038610977e-07, + "logits/chosen": 4.38671875, + "logits/rejected": 4.30859375, + "logps/chosen": -869.5, + "logps/rejected": -691.5, + "loss": 0.349, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.318359375, + "rewards/margins": 7.625, + "rewards/rejected": -5.3125, + "step": 2090 + }, + { + "epoch": 0.41490153281412767, + "grad_norm": 35.64449591117496, + "learning_rate": 7.547399135700651e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.23046875, + "logps/chosen": -959.5, + "logps/rejected": -1655.0, + 
"loss": 0.4114, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.5966796875, + "rewards/margins": 9.375, + "rewards/rejected": -7.7734375, + "step": 2091 + }, + { + "epoch": 0.41509995535492833, + "grad_norm": 30.43660862709175, + "learning_rate": 7.544623250689329e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.140625, + "logps/chosen": -1250.0, + "logps/rejected": -821.0, + "loss": 0.3241, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.90234375, + "rewards/margins": 7.2734375, + "rewards/rejected": -4.3828125, + "step": 2092 + }, + { + "epoch": 0.41529837789572893, + "grad_norm": 31.024285844489278, + "learning_rate": 7.541846384908554e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.8828125, + "logps/chosen": -1105.0, + "logps/rejected": -574.0, + "loss": 0.5064, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7734375, + "rewards/margins": 5.8125, + "rewards/rejected": -3.04296875, + "step": 2093 + }, + { + "epoch": 0.4154968004365296, + "grad_norm": 30.534854614319638, + "learning_rate": 7.539068539690336e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.30078125, + "logps/chosen": -949.5, + "logps/rejected": -739.0, + "loss": 0.4237, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.77734375, + "rewards/margins": 7.609375, + "rewards/rejected": -4.8125, + "step": 2094 + }, + { + "epoch": 0.41569522297733025, + "grad_norm": 37.78375758996652, + "learning_rate": 7.536289716367163e-07, + "logits/chosen": 3.61328125, + "logits/rejected": 3.60546875, + "logps/chosen": -1361.0, + "logps/rejected": -777.0, + "loss": 0.3696, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.259765625, + "rewards/margins": 7.94921875, + "rewards/rejected": -4.671875, + "step": 2095 + }, + { + "epoch": 0.41589364551813085, + "grad_norm": 33.673600610029084, + "learning_rate": 7.533509916271981e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.1953125, + "logps/chosen": -1012.5, + "logps/rejected": -911.0, + "loss": 0.3797, + 
"rewards/accuracies": 0.96875, + "rewards/chosen": 1.818359375, + "rewards/margins": 7.28125, + "rewards/rejected": -5.44921875, + "step": 2096 + }, + { + "epoch": 0.4160920680589315, + "grad_norm": 29.252175247861867, + "learning_rate": 7.530729140738216e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.01953125, + "logps/chosen": -804.0, + "logps/rejected": -613.0, + "loss": 0.4705, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.54296875, + "rewards/margins": 5.03515625, + "rewards/rejected": -3.4921875, + "step": 2097 + }, + { + "epoch": 0.4162904905997321, + "grad_norm": 37.21202866451215, + "learning_rate": 7.527947391099754e-07, + "logits/chosen": 3.96875, + "logits/rejected": 4.29296875, + "logps/chosen": -961.0, + "logps/rejected": -789.0, + "loss": 0.4962, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.21142578125, + "rewards/margins": 6.01171875, + "rewards/rejected": -4.80078125, + "step": 2098 + }, + { + "epoch": 0.41648891314053277, + "grad_norm": 28.20242728071759, + "learning_rate": 7.52516466869095e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.23046875, + "logps/chosen": -896.0, + "logps/rejected": -655.0, + "loss": 0.3422, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.99609375, + "rewards/margins": 5.78125, + "rewards/rejected": -3.78515625, + "step": 2099 + }, + { + "epoch": 0.4166873356813334, + "grad_norm": 37.64859679971383, + "learning_rate": 7.522380974846629e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.390625, + "logps/chosen": -746.0, + "logps/rejected": -756.0, + "loss": 0.4275, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.267578125, + "rewards/margins": 5.75390625, + "rewards/rejected": -3.49609375, + "step": 2100 + }, + { + "epoch": 0.41688575822213403, + "grad_norm": 37.06455239905786, + "learning_rate": 7.51959631090208e-07, + "logits/chosen": 3.57421875, + "logits/rejected": 3.921875, + "logps/chosen": -848.0, + "logps/rejected": -1679.5, + "loss": 0.453, + 
"rewards/accuracies": 0.90625, + "rewards/chosen": 1.978515625, + "rewards/margins": 10.3203125, + "rewards/rejected": -8.3125, + "step": 2101 + }, + { + "epoch": 0.4170841807629347, + "grad_norm": 32.09847694333356, + "learning_rate": 7.516810678193053e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.0546875, + "logps/chosen": -1438.0, + "logps/rejected": -900.0, + "loss": 0.3584, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.166015625, + "rewards/margins": 7.9375, + "rewards/rejected": -5.765625, + "step": 2102 + }, + { + "epoch": 0.4172826033037353, + "grad_norm": 34.40777901641294, + "learning_rate": 7.514024078055771e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 4.16015625, + "logps/chosen": -781.0, + "logps/rejected": -1434.5, + "loss": 0.4901, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.509765625, + "rewards/margins": 7.080078125, + "rewards/rejected": -5.576171875, + "step": 2103 + }, + { + "epoch": 0.41748102584453595, + "grad_norm": 40.08930232750686, + "learning_rate": 7.511236511826913e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 4.08984375, + "logps/chosen": -962.0, + "logps/rejected": -882.0, + "loss": 0.3846, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.55078125, + "rewards/margins": 8.109375, + "rewards/rejected": -5.56640625, + "step": 2104 + }, + { + "epoch": 0.41767944838533655, + "grad_norm": 26.686614433494807, + "learning_rate": 7.508447980843626e-07, + "logits/chosen": 4.40234375, + "logits/rejected": 4.8203125, + "logps/chosen": -817.5, + "logps/rejected": -877.0, + "loss": 0.4553, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7353515625, + "rewards/margins": 8.7265625, + "rewards/rejected": -6.0, + "step": 2105 + }, + { + "epoch": 0.4178778709261372, + "grad_norm": 31.82170493996776, + "learning_rate": 7.50565848644352e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 3.98046875, + "logps/chosen": -994.0, + "logps/rejected": -764.0, + "loss": 0.4448, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 1.9794921875, + "rewards/margins": 6.46875, + "rewards/rejected": -4.484375, + "step": 2106 + }, + { + "epoch": 0.41807629346693786, + "grad_norm": 31.67844427671455, + "learning_rate": 7.502868029964664e-07, + "logits/chosen": 4.5859375, + "logits/rejected": 4.4609375, + "logps/chosen": -744.0, + "logps/rejected": -521.5, + "loss": 0.4851, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.943359375, + "rewards/margins": 5.07421875, + "rewards/rejected": -3.13671875, + "step": 2107 + }, + { + "epoch": 0.41827471600773847, + "grad_norm": 42.75397768188397, + "learning_rate": 7.500076612745593e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.2734375, + "logps/chosen": -866.0, + "logps/rejected": -886.5, + "loss": 0.504, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6953125, + "rewards/margins": 5.46875, + "rewards/rejected": -3.76953125, + "step": 2108 + }, + { + "epoch": 0.4184731385485391, + "grad_norm": 33.68282882952191, + "learning_rate": 7.497284236125299e-07, + "logits/chosen": 4.23046875, + "logits/rejected": 4.6015625, + "logps/chosen": -1008.0, + "logps/rejected": -1429.0, + "loss": 0.4521, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.0078125, + "rewards/margins": 8.59375, + "rewards/rejected": -6.609375, + "step": 2109 + }, + { + "epoch": 0.4186715610893397, + "grad_norm": 29.733602543746663, + "learning_rate": 7.494490901443236e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 4.09375, + "logps/chosen": -1354.0, + "logps/rejected": -1050.0, + "loss": 0.3488, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.00390625, + "rewards/margins": 8.84375, + "rewards/rejected": -5.84375, + "step": 2110 + }, + { + "epoch": 0.4188699836301404, + "grad_norm": 33.88365414580279, + "learning_rate": 7.491696610039314e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 3.7265625, + "logps/chosen": -1042.0, + "logps/rejected": -609.0, + "loss": 0.3835, + "rewards/accuracies": 
0.84375, + "rewards/chosen": 2.53125, + "rewards/margins": 6.8671875, + "rewards/rejected": -4.33984375, + "step": 2111 + }, + { + "epoch": 0.41906840617094104, + "grad_norm": 30.493544755186303, + "learning_rate": 7.488901363253908e-07, + "logits/chosen": 3.8125, + "logits/rejected": 4.05859375, + "logps/chosen": -891.0, + "logps/rejected": -953.0, + "loss": 0.4182, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.9130859375, + "rewards/margins": 7.6328125, + "rewards/rejected": -5.720703125, + "step": 2112 + }, + { + "epoch": 0.41926682871174165, + "grad_norm": 35.04880725155934, + "learning_rate": 7.486105162427847e-07, + "logits/chosen": 4.375, + "logits/rejected": 3.90234375, + "logps/chosen": -1614.0, + "logps/rejected": -809.0, + "loss": 0.5252, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.015625, + "rewards/margins": 6.85546875, + "rewards/rejected": -5.8203125, + "step": 2113 + }, + { + "epoch": 0.4194652512525423, + "grad_norm": 34.059798307261765, + "learning_rate": 7.483308008902421e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.23828125, + "logps/chosen": -1074.0, + "logps/rejected": -993.0, + "loss": 0.3817, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.2578125, + "rewards/margins": 8.5859375, + "rewards/rejected": -6.31640625, + "step": 2114 + }, + { + "epoch": 0.4196636737933429, + "grad_norm": 28.51718661658161, + "learning_rate": 7.480509904019371e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.3671875, + "logps/chosen": -673.5, + "logps/rejected": -723.0, + "loss": 0.4862, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.716796875, + "rewards/margins": 6.82421875, + "rewards/rejected": -5.11328125, + "step": 2115 + }, + { + "epoch": 0.41986209633414356, + "grad_norm": 35.327181985801985, + "learning_rate": 7.477710849120902e-07, + "logits/chosen": 4.09375, + "logits/rejected": 3.7265625, + "logps/chosen": -1290.0, + "logps/rejected": -717.0, + "loss": 0.2938, + "rewards/accuracies": 0.9375, + 
"rewards/chosen": 2.47265625, + "rewards/margins": 8.60546875, + "rewards/rejected": -6.140625, + "step": 2116 + }, + { + "epoch": 0.42006051887494417, + "grad_norm": 31.88679049075864, + "learning_rate": 7.474910845549668e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 3.85546875, + "logps/chosen": -943.0, + "logps/rejected": -638.5, + "loss": 0.455, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.330078125, + "rewards/margins": 6.640625, + "rewards/rejected": -4.3046875, + "step": 2117 + }, + { + "epoch": 0.4202589414157448, + "grad_norm": 30.88039652443602, + "learning_rate": 7.472109894648783e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.37890625, + "logps/chosen": -812.0, + "logps/rejected": -810.0, + "loss": 0.427, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.86328125, + "rewards/margins": 6.46875, + "rewards/rejected": -3.59765625, + "step": 2118 + }, + { + "epoch": 0.4204573639565455, + "grad_norm": 37.83323608988224, + "learning_rate": 7.469307997761813e-07, + "logits/chosen": 3.44921875, + "logits/rejected": 3.63671875, + "logps/chosen": -1284.0, + "logps/rejected": -899.0, + "loss": 0.3771, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.529296875, + "rewards/margins": 7.453125, + "rewards/rejected": -4.9296875, + "step": 2119 + }, + { + "epoch": 0.4206557864973461, + "grad_norm": 36.680367348185634, + "learning_rate": 7.466505156232777e-07, + "logits/chosen": 4.22265625, + "logits/rejected": 4.2578125, + "logps/chosen": -1190.0, + "logps/rejected": -807.0, + "loss": 0.4486, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.810546875, + "rewards/margins": 7.21875, + "rewards/rejected": -5.40625, + "step": 2120 + }, + { + "epoch": 0.42085420903814674, + "grad_norm": 24.043116369295827, + "learning_rate": 7.463701371406151e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 3.6875, + "logps/chosen": -985.0, + "logps/rejected": -658.0, + "loss": 0.3284, + "rewards/accuracies": 0.90625, + "rewards/chosen": 
2.36328125, + "rewards/margins": 7.6796875, + "rewards/rejected": -5.3125, + "step": 2121 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 35.18147061938797, + "learning_rate": 7.460896644626857e-07, + "logits/chosen": 4.6015625, + "logits/rejected": 4.5390625, + "logps/chosen": -728.5, + "logps/rejected": -545.0, + "loss": 0.4663, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.23046875, + "rewards/margins": 6.1484375, + "rewards/rejected": -3.919921875, + "step": 2122 + }, + { + "epoch": 0.421251054119748, + "grad_norm": 34.32943659074894, + "learning_rate": 7.458090977240274e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.98046875, + "logps/chosen": -1054.0, + "logps/rejected": -2012.5, + "loss": 0.3672, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.625, + "rewards/margins": 11.6953125, + "rewards/rejected": -9.07421875, + "step": 2123 + }, + { + "epoch": 0.42144947666054866, + "grad_norm": 38.13556875320929, + "learning_rate": 7.455284370592233e-07, + "logits/chosen": 4.234375, + "logits/rejected": 3.99609375, + "logps/chosen": -985.0, + "logps/rejected": -739.0, + "loss": 0.4167, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.650390625, + "rewards/margins": 6.9140625, + "rewards/rejected": -5.265625, + "step": 2124 + }, + { + "epoch": 0.42164789920134926, + "grad_norm": 39.02834882475495, + "learning_rate": 7.45247682602901e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.328125, + "logps/chosen": -775.0, + "logps/rejected": -718.0, + "loss": 0.4804, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.4443359375, + "rewards/margins": 5.7890625, + "rewards/rejected": -4.3515625, + "step": 2125 + }, + { + "epoch": 0.4218463217421499, + "grad_norm": 27.46169760144338, + "learning_rate": 7.449668344897338e-07, + "logits/chosen": 4.37890625, + "logits/rejected": 4.3125, + "logps/chosen": -1114.5, + "logps/rejected": -713.0, + "loss": 0.3121, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.66796875, + 
"rewards/margins": 6.6875, + "rewards/rejected": -4.02734375, + "step": 2126 + }, + { + "epoch": 0.4220447442829505, + "grad_norm": 36.05155905006307, + "learning_rate": 7.446858928544392e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.4140625, + "logps/chosen": -1281.0, + "logps/rejected": -1665.0, + "loss": 0.348, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6171875, + "rewards/margins": 9.8984375, + "rewards/rejected": -7.25390625, + "step": 2127 + }, + { + "epoch": 0.4222431668237512, + "grad_norm": 26.125745842785918, + "learning_rate": 7.444048578317802e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.33984375, + "logps/chosen": -1077.0, + "logps/rejected": -748.5, + "loss": 0.4072, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6953125, + "rewards/margins": 6.21875, + "rewards/rejected": -3.533203125, + "step": 2128 + }, + { + "epoch": 0.42244158936455184, + "grad_norm": 42.23626587014711, + "learning_rate": 7.441237295565641e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.2890625, + "logps/chosen": -1013.0, + "logps/rejected": -647.0, + "loss": 0.5354, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.353515625, + "rewards/margins": 6.0625, + "rewards/rejected": -4.7109375, + "step": 2129 + }, + { + "epoch": 0.42264001190535244, + "grad_norm": 29.88026238637349, + "learning_rate": 7.438425081636432e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.765625, + "logps/chosen": -760.0, + "logps/rejected": -506.5, + "loss": 0.5414, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.830078125, + "rewards/margins": 4.50390625, + "rewards/rejected": -2.66796875, + "step": 2130 + }, + { + "epoch": 0.4228384344461531, + "grad_norm": 29.76313139634026, + "learning_rate": 7.435611937879143e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.73828125, + "logps/chosen": -1298.0, + "logps/rejected": -775.0, + "loss": 0.386, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.57421875, + "rewards/margins": 
7.9921875, + "rewards/rejected": -5.4296875, + "step": 2131 + }, + { + "epoch": 0.4230368569869537, + "grad_norm": 34.624103437361555, + "learning_rate": 7.432797865643191e-07, + "logits/chosen": 4.5390625, + "logits/rejected": 4.390625, + "logps/chosen": -1002.0, + "logps/rejected": -653.0, + "loss": 0.3966, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.455078125, + "rewards/margins": 7.1171875, + "rewards/rejected": -4.6640625, + "step": 2132 + }, + { + "epoch": 0.42323527952775436, + "grad_norm": 32.94551955257026, + "learning_rate": 7.429982866278434e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.25, + "logps/chosen": -1027.0, + "logps/rejected": -601.5, + "loss": 0.3727, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.7177734375, + "rewards/margins": 6.5703125, + "rewards/rejected": -4.86328125, + "step": 2133 + }, + { + "epoch": 0.42343370206855496, + "grad_norm": 38.3874735912703, + "learning_rate": 7.427166941135181e-07, + "logits/chosen": 4.05078125, + "logits/rejected": 4.09375, + "logps/chosen": -675.0, + "logps/rejected": -711.5, + "loss": 0.5775, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.4765625, + "rewards/margins": 6.44140625, + "rewards/rejected": -4.96484375, + "step": 2134 + }, + { + "epoch": 0.4236321246093556, + "grad_norm": 34.670659199162344, + "learning_rate": 7.42435009156418e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.1796875, + "logps/chosen": -849.0, + "logps/rejected": -798.0, + "loss": 0.3496, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.146484375, + "rewards/margins": 7.765625, + "rewards/rejected": -5.6171875, + "step": 2135 + }, + { + "epoch": 0.4238305471501563, + "grad_norm": 39.9132155975581, + "learning_rate": 7.421532318916621e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.1328125, + "logps/chosen": -1166.0, + "logps/rejected": -1131.0, + "loss": 0.4234, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.029296875, + "rewards/margins": 10.2109375, + 
"rewards/rejected": -8.203125, + "step": 2136 + }, + { + "epoch": 0.4240289696909569, + "grad_norm": 41.79367899023375, + "learning_rate": 7.418713624544143e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.3046875, + "logps/chosen": -898.5, + "logps/rejected": -677.5, + "loss": 0.4655, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.30078125, + "rewards/margins": 7.4453125, + "rewards/rejected": -5.1484375, + "step": 2137 + }, + { + "epoch": 0.42422739223175754, + "grad_norm": 26.85472468953351, + "learning_rate": 7.41589400979882e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 4.0546875, + "logps/chosen": -713.0, + "logps/rejected": -591.5, + "loss": 0.5548, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0660400390625, + "rewards/margins": 5.82421875, + "rewards/rejected": -4.7578125, + "step": 2138 + }, + { + "epoch": 0.42442581477255814, + "grad_norm": 30.957595996780846, + "learning_rate": 7.413073476033173e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.984375, + "logps/chosen": -1114.0, + "logps/rejected": -789.0, + "loss": 0.4472, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.255859375, + "rewards/margins": 7.234375, + "rewards/rejected": -4.984375, + "step": 2139 + }, + { + "epoch": 0.4246242373133588, + "grad_norm": 43.4720922465963, + "learning_rate": 7.410252024600163e-07, + "logits/chosen": 3.546875, + "logits/rejected": 3.28125, + "logps/chosen": -1179.0, + "logps/rejected": -765.0, + "loss": 0.3777, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9072265625, + "rewards/margins": 8.1640625, + "rewards/rejected": -6.25, + "step": 2140 + }, + { + "epoch": 0.42482265985415946, + "grad_norm": 28.382226237347425, + "learning_rate": 7.40742965685319e-07, + "logits/chosen": 3.6640625, + "logits/rejected": 3.375, + "logps/chosen": -1344.0, + "logps/rejected": -840.0, + "loss": 0.3482, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.1796875, + "rewards/margins": 8.78125, + "rewards/rejected": 
-6.6171875, + "step": 2141 + }, + { + "epoch": 0.42502108239496006, + "grad_norm": 29.48790842399322, + "learning_rate": 7.404606374146092e-07, + "logits/chosen": 3.6796875, + "logits/rejected": 3.50390625, + "logps/chosen": -1024.0, + "logps/rejected": -738.0, + "loss": 0.468, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.6611328125, + "rewards/margins": 6.4453125, + "rewards/rejected": -4.796875, + "step": 2142 + }, + { + "epoch": 0.4252195049357607, + "grad_norm": 34.87778357697224, + "learning_rate": 7.401782177833147e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.93359375, + "logps/chosen": -857.0, + "logps/rejected": -648.5, + "loss": 0.3429, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.26171875, + "rewards/margins": 8.1015625, + "rewards/rejected": -5.84375, + "step": 2143 + }, + { + "epoch": 0.4254179274765613, + "grad_norm": 38.20194883728292, + "learning_rate": 7.398957069269074e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.18359375, + "logps/chosen": -729.0, + "logps/rejected": -544.0, + "loss": 0.5092, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.01708984375, + "rewards/margins": 5.1640625, + "rewards/rejected": -4.15234375, + "step": 2144 + }, + { + "epoch": 0.425616350017362, + "grad_norm": 38.02936745804477, + "learning_rate": 7.396131049809027e-07, + "logits/chosen": 3.73828125, + "logits/rejected": 4.12109375, + "logps/chosen": -1009.0, + "logps/rejected": -1540.0, + "loss": 0.6646, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.72705078125, + "rewards/margins": 6.47265625, + "rewards/rejected": -5.75390625, + "step": 2145 + }, + { + "epoch": 0.4258147725581626, + "grad_norm": 30.479971431965748, + "learning_rate": 7.393304120808597e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 4.19921875, + "logps/chosen": -1164.0, + "logps/rejected": -723.0, + "loss": 0.3849, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.986328125, + "rewards/margins": 7.1171875, + "rewards/rejected": 
-5.125, + "step": 2146 + }, + { + "epoch": 0.42601319509896324, + "grad_norm": 28.02120051623961, + "learning_rate": 7.390476283623813e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.109375, + "logps/chosen": -1214.0, + "logps/rejected": -745.0, + "loss": 0.4684, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.0377197265625, + "rewards/margins": 5.76171875, + "rewards/rejected": -3.724609375, + "step": 2147 + }, + { + "epoch": 0.4262116176397639, + "grad_norm": 30.71199697948726, + "learning_rate": 7.387647539611134e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 3.56640625, + "logps/chosen": -944.0, + "logps/rejected": -615.5, + "loss": 0.3449, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5771484375, + "rewards/margins": 8.1796875, + "rewards/rejected": -5.625, + "step": 2148 + }, + { + "epoch": 0.4264100401805645, + "grad_norm": 36.539489981429114, + "learning_rate": 7.384817890127462e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.640625, + "logps/chosen": -889.0, + "logps/rejected": -1252.0, + "loss": 0.5271, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.34765625, + "rewards/margins": 7.796875, + "rewards/rejected": -6.46875, + "step": 2149 + }, + { + "epoch": 0.42660846272136516, + "grad_norm": 37.64900948286192, + "learning_rate": 7.381987336530128e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.01171875, + "logps/chosen": -636.5, + "logps/rejected": -696.0, + "loss": 0.5833, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0234375, + "rewards/margins": 5.255859375, + "rewards/rejected": -4.23828125, + "step": 2150 + }, + { + "epoch": 0.42680688526216576, + "grad_norm": 41.794059421151935, + "learning_rate": 7.3791558801769e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.23046875, + "logps/chosen": -827.0, + "logps/rejected": -706.0, + "loss": 0.5547, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.33056640625, + "rewards/margins": 5.49462890625, + "rewards/rejected": 
-4.165283203125, + "step": 2151 + }, + { + "epoch": 0.4270053078029664, + "grad_norm": 38.38438432559141, + "learning_rate": 7.376323522425976e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.00390625, + "logps/chosen": -1229.0, + "logps/rejected": -992.0, + "loss": 0.4197, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.580078125, + "rewards/margins": 7.171875, + "rewards/rejected": -4.60546875, + "step": 2152 + }, + { + "epoch": 0.4272037303437671, + "grad_norm": 40.83737610749968, + "learning_rate": 7.373490264635989e-07, + "logits/chosen": 3.828125, + "logits/rejected": 3.953125, + "logps/chosen": -945.0, + "logps/rejected": -1026.0, + "loss": 0.4153, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.447265625, + "rewards/margins": 16.28125, + "rewards/rejected": -14.859375, + "step": 2153 + }, + { + "epoch": 0.4274021528845677, + "grad_norm": 34.50201631602115, + "learning_rate": 7.370656108165999e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 4.16015625, + "logps/chosen": -949.5, + "logps/rejected": -1228.0, + "loss": 0.4828, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.884765625, + "rewards/margins": 6.7109375, + "rewards/rejected": -4.828125, + "step": 2154 + }, + { + "epoch": 0.42760057542536833, + "grad_norm": 40.576743353397, + "learning_rate": 7.367821054375504e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.28125, + "logps/chosen": -1066.0, + "logps/rejected": -819.0, + "loss": 0.5492, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.845703125, + "rewards/margins": 5.3359375, + "rewards/rejected": -3.484375, + "step": 2155 + }, + { + "epoch": 0.42779899796616894, + "grad_norm": 35.27044029460446, + "learning_rate": 7.364985104624428e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.44140625, + "logps/chosen": -1094.0, + "logps/rejected": -1063.0, + "loss": 0.3328, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.98828125, + "rewards/margins": 8.3671875, + "rewards/rejected": 
-5.37890625, + "step": 2156 + }, + { + "epoch": 0.4279974205069696, + "grad_norm": 31.864275444032053, + "learning_rate": 7.362148260273126e-07, + "logits/chosen": 3.5078125, + "logits/rejected": 3.61328125, + "logps/chosen": -1179.0, + "logps/rejected": -820.0, + "loss": 0.4379, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9375, + "rewards/margins": 6.05078125, + "rewards/rejected": -4.10546875, + "step": 2157 + }, + { + "epoch": 0.42819584304777025, + "grad_norm": 39.91723981054519, + "learning_rate": 7.359310522682381e-07, + "logits/chosen": 4.515625, + "logits/rejected": 4.28125, + "logps/chosen": -902.0, + "logps/rejected": -723.0, + "loss": 0.4067, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4833984375, + "rewards/margins": 6.375, + "rewards/rejected": -3.89453125, + "step": 2158 + }, + { + "epoch": 0.42839426558857086, + "grad_norm": 35.47750651838704, + "learning_rate": 7.356471893213406e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.26953125, + "logps/chosen": -1128.0, + "logps/rejected": -852.0, + "loss": 0.441, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.94140625, + "rewards/margins": 5.8515625, + "rewards/rejected": -2.903564453125, + "step": 2159 + }, + { + "epoch": 0.4285926881293715, + "grad_norm": 35.2209127915584, + "learning_rate": 7.353632373227841e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.91015625, + "logps/chosen": -1179.0, + "logps/rejected": -934.0, + "loss": 0.4537, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.435546875, + "rewards/margins": 6.8046875, + "rewards/rejected": -4.375, + "step": 2160 + }, + { + "epoch": 0.4287911106701721, + "grad_norm": 40.326052014301744, + "learning_rate": 7.350791964087752e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.25, + "logps/chosen": -888.5, + "logps/rejected": -604.0, + "loss": 0.4359, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.8017578125, + "rewards/margins": 5.4296875, + "rewards/rejected": -3.63671875, + "step": 
2161 + }, + { + "epoch": 0.4289895332109728, + "grad_norm": 38.779368296799554, + "learning_rate": 7.347950667155636e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.3046875, + "logps/chosen": -1040.0, + "logps/rejected": -800.0, + "loss": 0.4708, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.294921875, + "rewards/margins": 5.80078125, + "rewards/rejected": -3.50390625, + "step": 2162 + }, + { + "epoch": 0.4291879557517734, + "grad_norm": 30.71485772882016, + "learning_rate": 7.345108483794408e-07, + "logits/chosen": 3.46875, + "logits/rejected": 3.5859375, + "logps/chosen": -941.0, + "logps/rejected": -771.0, + "loss": 0.3117, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.02734375, + "rewards/margins": 7.6796875, + "rewards/rejected": -4.66015625, + "step": 2163 + }, + { + "epoch": 0.42938637829257403, + "grad_norm": 33.226304444011916, + "learning_rate": 7.342265415367416e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 4.1640625, + "logps/chosen": -674.0, + "logps/rejected": -627.5, + "loss": 0.5145, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.697265625, + "rewards/margins": 5.5859375, + "rewards/rejected": -3.88671875, + "step": 2164 + }, + { + "epoch": 0.4295848008333747, + "grad_norm": 30.126273166453977, + "learning_rate": 7.33942146323843e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 3.92578125, + "logps/chosen": -981.0, + "logps/rejected": -930.0, + "loss": 0.409, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.291015625, + "rewards/margins": 7.69140625, + "rewards/rejected": -5.390625, + "step": 2165 + }, + { + "epoch": 0.4297832233741753, + "grad_norm": 29.913872645728134, + "learning_rate": 7.336576628771641e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.1875, + "logps/chosen": -1340.0, + "logps/rejected": -805.0, + "loss": 0.37, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7265625, + "rewards/margins": 8.4296875, + "rewards/rejected": -5.71484375, + "step": 2166 + }, + { 
+ "epoch": 0.42998164591497595, + "grad_norm": 43.79846743790707, + "learning_rate": 7.333730913331668e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 3.57421875, + "logps/chosen": -794.0, + "logps/rejected": -510.0, + "loss": 0.5203, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.22705078125, + "rewards/margins": 5.9296875, + "rewards/rejected": -4.6953125, + "step": 2167 + }, + { + "epoch": 0.43018006845577655, + "grad_norm": 30.242677567993663, + "learning_rate": 7.330884318283549e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.16015625, + "logps/chosen": -1311.0, + "logps/rejected": -858.0, + "loss": 0.2885, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4912109375, + "rewards/margins": 9.3125, + "rewards/rejected": -6.828125, + "step": 2168 + }, + { + "epoch": 0.4303784909965772, + "grad_norm": 31.983671499886157, + "learning_rate": 7.328036844992745e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.11328125, + "logps/chosen": -1121.0, + "logps/rejected": -852.0, + "loss": 0.4433, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.077392578125, + "rewards/margins": 6.9296875, + "rewards/rejected": -4.859375, + "step": 2169 + }, + { + "epoch": 0.43057691353737787, + "grad_norm": 36.107862044778194, + "learning_rate": 7.325188494825138e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.02734375, + "logps/chosen": -922.0, + "logps/rejected": -592.5, + "loss": 0.4278, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.8515625, + "rewards/margins": 7.0234375, + "rewards/rejected": -5.16796875, + "step": 2170 + }, + { + "epoch": 0.4307753360781785, + "grad_norm": 45.44617419311703, + "learning_rate": 7.322339269147031e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 4.0859375, + "logps/chosen": -1031.5, + "logps/rejected": -824.0, + "loss": 0.4008, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6689453125, + "rewards/margins": 7.7421875, + "rewards/rejected": -6.06640625, + "step": 2171 + }, + { 
+ "epoch": 0.43097375861897913, + "grad_norm": 34.134677470274084, + "learning_rate": 7.319489169325148e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.16015625, + "logps/chosen": -1004.0, + "logps/rejected": -902.0, + "loss": 0.427, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.966796875, + "rewards/margins": 7.34375, + "rewards/rejected": -5.375, + "step": 2172 + }, + { + "epoch": 0.43117218115977973, + "grad_norm": 27.84239608516989, + "learning_rate": 7.31663819672663e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 3.9453125, + "logps/chosen": -994.0, + "logps/rejected": -897.0, + "loss": 0.2476, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.537109375, + "rewards/margins": 10.765625, + "rewards/rejected": -8.234375, + "step": 2173 + }, + { + "epoch": 0.4313706037005804, + "grad_norm": 39.04876113998148, + "learning_rate": 7.313786352719038e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 3.8671875, + "logps/chosen": -1071.0, + "logps/rejected": -771.0, + "loss": 0.3467, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.537109375, + "rewards/margins": 8.875, + "rewards/rejected": -6.3359375, + "step": 2174 + }, + { + "epoch": 0.43156902624138105, + "grad_norm": 34.897037183659876, + "learning_rate": 7.310933638670352e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.1796875, + "logps/chosen": -1229.0, + "logps/rejected": -790.0, + "loss": 0.3231, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.046875, + "rewards/margins": 8.15625, + "rewards/rejected": -5.109375, + "step": 2175 + }, + { + "epoch": 0.43176744878218165, + "grad_norm": 32.99133248550364, + "learning_rate": 7.308080055948968e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.3828125, + "logps/chosen": -1006.0, + "logps/rejected": -606.0, + "loss": 0.3482, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.31640625, + "rewards/margins": 7.546875, + "rewards/rejected": -5.22265625, + "step": 2176 + }, + { + "epoch": 
0.4319658713229823, + "grad_norm": 28.688142043784676, + "learning_rate": 7.305225605923699e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.04296875, + "logps/chosen": -879.0, + "logps/rejected": -677.0, + "loss": 0.4728, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.087890625, + "rewards/margins": 5.9765625, + "rewards/rejected": -3.88671875, + "step": 2177 + }, + { + "epoch": 0.4321642938637829, + "grad_norm": 28.166032623324813, + "learning_rate": 7.302370289963773e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.85546875, + "logps/chosen": -774.0, + "logps/rejected": -730.5, + "loss": 0.4681, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.2021484375, + "rewards/margins": 6.23828125, + "rewards/rejected": -4.0390625, + "step": 2178 + }, + { + "epoch": 0.43236271640458357, + "grad_norm": 28.16214309042093, + "learning_rate": 7.299514109438834e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.8671875, + "logps/chosen": -1088.0, + "logps/rejected": -1568.0, + "loss": 0.3488, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.998046875, + "rewards/margins": 10.0390625, + "rewards/rejected": -7.03515625, + "step": 2179 + }, + { + "epoch": 0.43256113894538417, + "grad_norm": 30.943959705400417, + "learning_rate": 7.296657065718943e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 4.12109375, + "logps/chosen": -851.0, + "logps/rejected": -875.0, + "loss": 0.3692, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.265625, + "rewards/margins": 7.7265625, + "rewards/rejected": -5.4609375, + "step": 2180 + }, + { + "epoch": 0.43275956148618483, + "grad_norm": 30.18890323027476, + "learning_rate": 7.293799160174572e-07, + "logits/chosen": 4.5390625, + "logits/rejected": 4.34375, + "logps/chosen": -1254.0, + "logps/rejected": -716.0, + "loss": 0.4445, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.53125, + "rewards/margins": 6.4453125, + "rewards/rejected": -3.921875, + "step": 2181 + }, + { + "epoch": 
0.4329579840269855, + "grad_norm": 43.14626696164631, + "learning_rate": 7.290940394176606e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.92578125, + "logps/chosen": -1242.0, + "logps/rejected": -679.0, + "loss": 0.482, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.08984375, + "rewards/margins": 5.9765625, + "rewards/rejected": -3.87890625, + "step": 2182 + }, + { + "epoch": 0.4331564065677861, + "grad_norm": 41.31301394921071, + "learning_rate": 7.288080769096347e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.05078125, + "logps/chosen": -867.0, + "logps/rejected": -572.5, + "loss": 0.5203, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.841796875, + "rewards/margins": 4.8359375, + "rewards/rejected": -3.0, + "step": 2183 + }, + { + "epoch": 0.43335482910858675, + "grad_norm": 25.940098884473986, + "learning_rate": 7.285220286305507e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.03515625, + "logps/chosen": -1226.0, + "logps/rejected": -645.0, + "loss": 0.336, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.384765625, + "rewards/margins": 7.1953125, + "rewards/rejected": -4.80859375, + "step": 2184 + }, + { + "epoch": 0.43355325164938735, + "grad_norm": 20.272804055014777, + "learning_rate": 7.282358947176205e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.21484375, + "logps/chosen": -1143.0, + "logps/rejected": -1092.5, + "loss": 0.2674, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.421875, + "rewards/margins": 10.0078125, + "rewards/rejected": -6.6015625, + "step": 2185 + }, + { + "epoch": 0.433751674190188, + "grad_norm": 31.022736127607093, + "learning_rate": 7.279496753080978e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.1328125, + "logps/chosen": -824.0, + "logps/rejected": -705.0, + "loss": 0.5225, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.015625, + "rewards/margins": 5.95703125, + "rewards/rejected": -3.9296875, + "step": 2186 + }, + { + "epoch": 
0.43395009673098867, + "grad_norm": 31.58339805152667, + "learning_rate": 7.276633705392766e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.4296875, + "logps/chosen": -1124.0, + "logps/rejected": -699.5, + "loss": 0.4373, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0, + "rewards/margins": 6.5703125, + "rewards/rejected": -3.560546875, + "step": 2187 + }, + { + "epoch": 0.43414851927178927, + "grad_norm": 32.73695297187011, + "learning_rate": 7.273769805484927e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.37890625, + "logps/chosen": -937.0, + "logps/rejected": -883.0, + "loss": 0.4667, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.0537109375, + "rewards/margins": 6.140625, + "rewards/rejected": -4.07421875, + "step": 2188 + }, + { + "epoch": 0.4343469418125899, + "grad_norm": 33.98169299887217, + "learning_rate": 7.270905054731218e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.9921875, + "logps/chosen": -661.0, + "logps/rejected": -522.5, + "loss": 0.4976, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.080078125, + "rewards/margins": 5.76171875, + "rewards/rejected": -3.68359375, + "step": 2189 + }, + { + "epoch": 0.43454536435339053, + "grad_norm": 24.096316752168423, + "learning_rate": 7.268039454505813e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.4921875, + "logps/chosen": -1450.0, + "logps/rejected": -1525.0, + "loss": 0.2575, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2392578125, + "rewards/margins": 12.078125, + "rewards/rejected": -8.84375, + "step": 2190 + }, + { + "epoch": 0.4347437868941912, + "grad_norm": 29.430035700456127, + "learning_rate": 7.265173006183287e-07, + "logits/chosen": 3.77734375, + "logits/rejected": 3.6875, + "logps/chosen": -776.0, + "logps/rejected": -499.0, + "loss": 0.4348, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.99609375, + "rewards/margins": 5.9921875, + "rewards/rejected": -3.98828125, + "step": 2191 + }, + { + "epoch": 0.4349422094349918, + 
"grad_norm": 38.584516207396916, + "learning_rate": 7.262305711138627e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.2265625, + "logps/chosen": -950.0, + "logps/rejected": -1035.5, + "loss": 0.5596, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.53955078125, + "rewards/margins": 7.84375, + "rewards/rejected": -6.30859375, + "step": 2192 + }, + { + "epoch": 0.43514063197579245, + "grad_norm": 36.005852699997604, + "learning_rate": 7.259437570747219e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 4.12890625, + "logps/chosen": -661.5, + "logps/rejected": -627.0, + "loss": 0.6386, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.3017578125, + "rewards/margins": 4.1171875, + "rewards/rejected": -2.818359375, + "step": 2193 + }, + { + "epoch": 0.4353390545165931, + "grad_norm": 31.065455532446528, + "learning_rate": 7.256568586384862e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 4.3046875, + "logps/chosen": -976.0, + "logps/rejected": -1522.5, + "loss": 0.4721, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.78125, + "rewards/margins": 8.48828125, + "rewards/rejected": -7.6796875, + "step": 2194 + }, + { + "epoch": 0.4355374770573937, + "grad_norm": 25.683617049657308, + "learning_rate": 7.253698759427757e-07, + "logits/chosen": 3.6640625, + "logits/rejected": 3.9296875, + "logps/chosen": -830.0, + "logps/rejected": -633.0, + "loss": 0.3959, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.59765625, + "rewards/margins": 6.9609375, + "rewards/rejected": -4.359375, + "step": 2195 + }, + { + "epoch": 0.43573589959819437, + "grad_norm": 38.1605889829388, + "learning_rate": 7.250828091252509e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 4.1484375, + "logps/chosen": -765.0, + "logps/rejected": -664.0, + "loss": 0.5244, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.4638671875, + "rewards/margins": 6.03125, + "rewards/rejected": -4.55859375, + "step": 2196 + }, + { + "epoch": 0.43593432213899497, + 
"grad_norm": 31.298997509211173, + "learning_rate": 7.247956583236127e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.171875, + "logps/chosen": -908.5, + "logps/rejected": -757.0, + "loss": 0.4536, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3701171875, + "rewards/margins": 7.69921875, + "rewards/rejected": -5.3359375, + "step": 2197 + }, + { + "epoch": 0.4361327446797956, + "grad_norm": 28.98717934587776, + "learning_rate": 7.245084236756019e-07, + "logits/chosen": 3.75, + "logits/rejected": 3.9140625, + "logps/chosen": -988.5, + "logps/rejected": -675.5, + "loss": 0.339, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.46826171875, + "rewards/margins": 7.2421875, + "rewards/rejected": -5.765625, + "step": 2198 + }, + { + "epoch": 0.4363311672205963, + "grad_norm": 27.20793065468965, + "learning_rate": 7.242211053190002e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.796875, + "logps/chosen": -863.0, + "logps/rejected": -843.0, + "loss": 0.3822, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.79296875, + "rewards/margins": 18.359375, + "rewards/rejected": -16.6015625, + "step": 2199 + }, + { + "epoch": 0.4365295897613969, + "grad_norm": 26.039540742503004, + "learning_rate": 7.239337033916291e-07, + "logits/chosen": 4.03125, + "logits/rejected": 3.73828125, + "logps/chosen": -1126.0, + "logps/rejected": -708.0, + "loss": 0.3236, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.869140625, + "rewards/margins": 8.4453125, + "rewards/rejected": -5.5546875, + "step": 2200 + }, + { + "epoch": 0.43672801230219754, + "grad_norm": 33.47920657087908, + "learning_rate": 7.236462180313502e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.1171875, + "logps/chosen": -1187.0, + "logps/rejected": -821.0, + "loss": 0.4501, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3291015625, + "rewards/margins": 7.625, + "rewards/rejected": -5.28125, + "step": 2201 + }, + { + "epoch": 0.43692643484299815, + "grad_norm": 
31.573928070769792, + "learning_rate": 7.233586493760649e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.83203125, + "logps/chosen": -941.0, + "logps/rejected": -560.0, + "loss": 0.346, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.44140625, + "rewards/margins": 6.9296875, + "rewards/rejected": -4.48046875, + "step": 2202 + }, + { + "epoch": 0.4371248573837988, + "grad_norm": 36.13673072672162, + "learning_rate": 7.230709975637148e-07, + "logits/chosen": 3.5546875, + "logits/rejected": 3.54296875, + "logps/chosen": -985.0, + "logps/rejected": -665.5, + "loss": 0.5088, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.18408203125, + "rewards/margins": 4.96875, + "rewards/rejected": -3.779296875, + "step": 2203 + }, + { + "epoch": 0.43732327992459946, + "grad_norm": 34.379298556871646, + "learning_rate": 7.227832627322815e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.80078125, + "logps/chosen": -831.0, + "logps/rejected": -620.0, + "loss": 0.5279, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.45703125, + "rewards/margins": 5.52734375, + "rewards/rejected": -4.07421875, + "step": 2204 + }, + { + "epoch": 0.43752170246540006, + "grad_norm": 39.537587615747995, + "learning_rate": 7.224954450197863e-07, + "logits/chosen": 4.0625, + "logits/rejected": 3.83203125, + "logps/chosen": -1058.0, + "logps/rejected": -693.5, + "loss": 0.4672, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.072265625, + "rewards/margins": 7.2109375, + "rewards/rejected": -5.13671875, + "step": 2205 + }, + { + "epoch": 0.4377201250062007, + "grad_norm": 31.241953200615338, + "learning_rate": 7.222075445642904e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.87890625, + "logps/chosen": -1011.0, + "logps/rejected": -547.0, + "loss": 0.5417, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3583984375, + "rewards/margins": 6.13671875, + "rewards/rejected": -3.771484375, + "step": 2206 + }, + { + "epoch": 0.4379185475470013, + "grad_norm": 
40.350686511422325, + "learning_rate": 7.219195615038942e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.1171875, + "logps/chosen": -1097.0, + "logps/rejected": -845.0, + "loss": 0.4841, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.203125, + "rewards/margins": 6.8046875, + "rewards/rejected": -4.59375, + "step": 2207 + }, + { + "epoch": 0.438116970087802, + "grad_norm": 33.47309882811642, + "learning_rate": 7.216314959767383e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.265625, + "logps/chosen": -871.0, + "logps/rejected": -698.5, + "loss": 0.4584, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.384765625, + "rewards/margins": 6.1171875, + "rewards/rejected": -3.7265625, + "step": 2208 + }, + { + "epoch": 0.4383153926286026, + "grad_norm": 31.362394498076675, + "learning_rate": 7.213433481210023e-07, + "logits/chosen": 3.52734375, + "logits/rejected": 3.69921875, + "logps/chosen": -999.0, + "logps/rejected": -689.5, + "loss": 0.3893, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.67578125, + "rewards/margins": 7.953125, + "rewards/rejected": -5.28125, + "step": 2209 + }, + { + "epoch": 0.43851381516940324, + "grad_norm": 34.684186113076414, + "learning_rate": 7.21055118074906e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.140625, + "logps/chosen": -1033.0, + "logps/rejected": -798.0, + "loss": 0.2353, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.724609375, + "rewards/margins": 13.265625, + "rewards/rejected": -10.5, + "step": 2210 + }, + { + "epoch": 0.4387122377102039, + "grad_norm": 32.23797097752592, + "learning_rate": 7.207668059767079e-07, + "logits/chosen": 4.546875, + "logits/rejected": 4.5234375, + "logps/chosen": -623.5, + "logps/rejected": -545.0, + "loss": 0.4645, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.5869140625, + "rewards/margins": 5.5234375, + "rewards/rejected": -3.9375, + "step": 2211 + }, + { + "epoch": 0.4389106602510045, + "grad_norm": 36.715027901476205, + 
"learning_rate": 7.204784119647065e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 3.74609375, + "logps/chosen": -884.0, + "logps/rejected": -662.0, + "loss": 0.4866, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.3759765625, + "rewards/margins": 4.73828125, + "rewards/rejected": -3.3515625, + "step": 2212 + }, + { + "epoch": 0.43910908279180516, + "grad_norm": 31.499999783208292, + "learning_rate": 7.201899361772391e-07, + "logits/chosen": 3.73046875, + "logits/rejected": 3.16015625, + "logps/chosen": -1046.0, + "logps/rejected": -697.0, + "loss": 0.4203, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.1171875, + "rewards/margins": 6.375, + "rewards/rejected": -4.2578125, + "step": 2213 + }, + { + "epoch": 0.43930750533260576, + "grad_norm": 24.461987924581095, + "learning_rate": 7.199013787526823e-07, + "logits/chosen": 4.8046875, + "logits/rejected": 4.7578125, + "logps/chosen": -1136.0, + "logps/rejected": -1694.0, + "loss": 0.4352, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.798828125, + "rewards/margins": 9.2421875, + "rewards/rejected": -6.44921875, + "step": 2214 + }, + { + "epoch": 0.4395059278734064, + "grad_norm": 30.848995155560694, + "learning_rate": 7.196127398294524e-07, + "logits/chosen": 4.5703125, + "logits/rejected": 4.265625, + "logps/chosen": -1024.0, + "logps/rejected": -702.0, + "loss": 0.2935, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1015625, + "rewards/margins": 6.8671875, + "rewards/rejected": -4.765625, + "step": 2215 + }, + { + "epoch": 0.4397043504142071, + "grad_norm": 30.8858933812884, + "learning_rate": 7.193240195460039e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 4.0859375, + "logps/chosen": -845.5, + "logps/rejected": -1492.0, + "loss": 0.3734, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1875, + "rewards/margins": 8.9296875, + "rewards/rejected": -6.75, + "step": 2216 + }, + { + "epoch": 0.4399027729550077, + "grad_norm": 33.24177720292188, + "learning_rate": 
7.19035218040831e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 3.8671875, + "logps/chosen": -753.0, + "logps/rejected": -612.0, + "loss": 0.5832, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.38671875, + "rewards/margins": 4.32421875, + "rewards/rejected": -2.939453125, + "step": 2217 + }, + { + "epoch": 0.44010119549580834, + "grad_norm": 34.37360497692542, + "learning_rate": 7.187463354524666e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.5, + "logps/chosen": -1013.0, + "logps/rejected": -617.0, + "loss": 0.3542, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.818359375, + "rewards/margins": 8.0078125, + "rewards/rejected": -5.1875, + "step": 2218 + }, + { + "epoch": 0.44029961803660894, + "grad_norm": 30.24619292087409, + "learning_rate": 7.184573719194827e-07, + "logits/chosen": 4.5234375, + "logits/rejected": 4.31640625, + "logps/chosen": -681.5, + "logps/rejected": -512.0, + "loss": 0.452, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.68701171875, + "rewards/margins": 5.984375, + "rewards/rejected": -4.283203125, + "step": 2219 + }, + { + "epoch": 0.4404980405774096, + "grad_norm": 30.690762311754295, + "learning_rate": 7.181683275804897e-07, + "logits/chosen": 4.015625, + "logits/rejected": 4.2421875, + "logps/chosen": -1234.5, + "logps/rejected": -804.0, + "loss": 0.4162, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8878173828125, + "rewards/margins": 7.9140625, + "rewards/rejected": -6.015625, + "step": 2220 + }, + { + "epoch": 0.4406964631182102, + "grad_norm": 34.04747933197959, + "learning_rate": 7.178792025741372e-07, + "logits/chosen": 4.47265625, + "logits/rejected": 4.16796875, + "logps/chosen": -1031.0, + "logps/rejected": -1383.0, + "loss": 0.4368, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.623046875, + "rewards/margins": 10.390625, + "rewards/rejected": -8.77734375, + "step": 2221 + }, + { + "epoch": 0.44089488565901086, + "grad_norm": 28.716555101134045, + "learning_rate": 
7.175899970391134e-07, + "logits/chosen": 3.73828125, + "logits/rejected": 3.54296875, + "logps/chosen": -711.0, + "logps/rejected": -1418.5, + "loss": 0.4345, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.0869140625, + "rewards/margins": 8.1484375, + "rewards/rejected": -6.0546875, + "step": 2222 + }, + { + "epoch": 0.4410933081998115, + "grad_norm": 34.46405315618071, + "learning_rate": 7.173007111141449e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.078125, + "logps/chosen": -880.0, + "logps/rejected": -470.5, + "loss": 0.544, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.146484375, + "rewards/margins": 5.578125, + "rewards/rejected": -3.419921875, + "step": 2223 + }, + { + "epoch": 0.4412917307406121, + "grad_norm": 25.961732303908, + "learning_rate": 7.17011344937997e-07, + "logits/chosen": 3.5625, + "logits/rejected": 3.515625, + "logps/chosen": -1017.0, + "logps/rejected": -801.5, + "loss": 0.4527, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.70916748046875, + "rewards/margins": 7.6171875, + "rewards/rejected": -5.9140625, + "step": 2224 + }, + { + "epoch": 0.4414901532814128, + "grad_norm": 36.35350713570023, + "learning_rate": 7.167218986494737e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 3.75, + "logps/chosen": -1134.0, + "logps/rejected": -772.5, + "loss": 0.2361, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.16796875, + "rewards/margins": 10.015625, + "rewards/rejected": -6.859375, + "step": 2225 + }, + { + "epoch": 0.4416885758222134, + "grad_norm": 37.53073747457325, + "learning_rate": 7.164323723874171e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.28125, + "logps/chosen": -822.0, + "logps/rejected": -1657.5, + "loss": 0.5279, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.14453125, + "rewards/margins": 7.921875, + "rewards/rejected": -5.765625, + "step": 2226 + }, + { + "epoch": 0.44188699836301404, + "grad_norm": 32.91460624106012, + "learning_rate": 7.161427662907079e-07, + 
"logits/chosen": 3.87109375, + "logits/rejected": 3.6875, + "logps/chosen": -729.0, + "logps/rejected": -522.5, + "loss": 0.5252, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.5078125, + "rewards/margins": 4.875, + "rewards/rejected": -3.3671875, + "step": 2227 + }, + { + "epoch": 0.4420854209038147, + "grad_norm": 28.851025630056398, + "learning_rate": 7.15853080498265e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 4.21875, + "logps/chosen": -757.0, + "logps/rejected": -1506.0, + "loss": 0.3788, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.234375, + "rewards/margins": 9.9765625, + "rewards/rejected": -6.7421875, + "step": 2228 + }, + { + "epoch": 0.4422838434446153, + "grad_norm": 33.00228117817701, + "learning_rate": 7.155633151490456e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.05078125, + "logps/chosen": -1303.0, + "logps/rejected": -891.0, + "loss": 0.4164, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8203125, + "rewards/margins": 7.40625, + "rewards/rejected": -4.59375, + "step": 2229 + }, + { + "epoch": 0.44248226598541596, + "grad_norm": 37.29223664715947, + "learning_rate": 7.152734703820444e-07, + "logits/chosen": 4.44140625, + "logits/rejected": 4.4375, + "logps/chosen": -899.0, + "logps/rejected": -628.0, + "loss": 0.4285, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.860595703125, + "rewards/margins": 6.23828125, + "rewards/rejected": -4.375, + "step": 2230 + }, + { + "epoch": 0.44268068852621656, + "grad_norm": 38.063622462507155, + "learning_rate": 7.149835463362957e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.1328125, + "logps/chosen": -849.0, + "logps/rejected": -627.5, + "loss": 0.5657, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.587890625, + "rewards/margins": 5.38671875, + "rewards/rejected": -3.8046875, + "step": 2231 + }, + { + "epoch": 0.4428791110670172, + "grad_norm": 36.28219492966895, + "learning_rate": 7.146935431508704e-07, + "logits/chosen": 4.5625, + 
"logits/rejected": 4.9296875, + "logps/chosen": -784.5, + "logps/rejected": -896.0, + "loss": 0.5297, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.365234375, + "rewards/margins": 6.44140625, + "rewards/rejected": -4.0791015625, + "step": 2232 + }, + { + "epoch": 0.4430775336078179, + "grad_norm": 32.12428583925194, + "learning_rate": 7.144034609648778e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.40625, + "logps/chosen": -1077.0, + "logps/rejected": -738.0, + "loss": 0.4438, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.109375, + "rewards/margins": 6.328125, + "rewards/rejected": -4.234375, + "step": 2233 + }, + { + "epoch": 0.4432759561486185, + "grad_norm": 32.540437187971676, + "learning_rate": 7.141132999174653e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.39453125, + "logps/chosen": -1027.0, + "logps/rejected": -758.0, + "loss": 0.4356, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.94140625, + "rewards/margins": 7.0234375, + "rewards/rejected": -4.06640625, + "step": 2234 + }, + { + "epoch": 0.44347437868941914, + "grad_norm": 32.243200641330596, + "learning_rate": 7.138230601478181e-07, + "logits/chosen": 4.5703125, + "logits/rejected": 4.33984375, + "logps/chosen": -992.0, + "logps/rejected": -529.0, + "loss": 0.3483, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.556640625, + "rewards/margins": 7.2265625, + "rewards/rejected": -4.65625, + "step": 2235 + }, + { + "epoch": 0.44367280123021974, + "grad_norm": 30.228282902832607, + "learning_rate": 7.13532741795159e-07, + "logits/chosen": 3.65625, + "logits/rejected": 3.640625, + "logps/chosen": -711.5, + "logps/rejected": -559.0, + "loss": 0.5036, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.86328125, + "rewards/margins": 4.9375, + "rewards/rejected": -3.0859375, + "step": 2236 + }, + { + "epoch": 0.4438712237710204, + "grad_norm": 24.687721420276503, + "learning_rate": 7.132423449987485e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 
4.17578125, + "logps/chosen": -701.0, + "logps/rejected": -562.0, + "loss": 0.3556, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.8486328125, + "rewards/margins": 6.8828125, + "rewards/rejected": -5.0390625, + "step": 2237 + }, + { + "epoch": 0.444069646311821, + "grad_norm": 37.19853143682292, + "learning_rate": 7.129518698978849e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.21484375, + "logps/chosen": -1052.0, + "logps/rejected": -632.5, + "loss": 0.5305, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.525390625, + "rewards/margins": 5.875, + "rewards/rejected": -3.35546875, + "step": 2238 + }, + { + "epoch": 0.44426806885262166, + "grad_norm": 33.766917297839484, + "learning_rate": 7.126613166319039e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 3.56640625, + "logps/chosen": -1236.0, + "logps/rejected": -769.0, + "loss": 0.2806, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.875, + "rewards/margins": 7.578125, + "rewards/rejected": -4.69921875, + "step": 2239 + }, + { + "epoch": 0.4444664913934223, + "grad_norm": 27.743791299270242, + "learning_rate": 7.123706853401784e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.09765625, + "logps/chosen": -1075.0, + "logps/rejected": -760.5, + "loss": 0.434, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.572265625, + "rewards/margins": 5.53125, + "rewards/rejected": -2.9609375, + "step": 2240 + }, + { + "epoch": 0.4446649139342229, + "grad_norm": 26.59641895292181, + "learning_rate": 7.120799761621197e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.0703125, + "logps/chosen": -1360.0, + "logps/rejected": -830.0, + "loss": 0.2691, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.30859375, + "rewards/margins": 9.4375, + "rewards/rejected": -6.140625, + "step": 2241 + }, + { + "epoch": 0.4448633364750236, + "grad_norm": 29.033316014328623, + "learning_rate": 7.117891892371753e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 3.7890625, + 
"logps/chosen": -965.0, + "logps/rejected": -725.5, + "loss": 0.3207, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7421875, + "rewards/margins": 17.1796875, + "rewards/rejected": -14.44921875, + "step": 2242 + }, + { + "epoch": 0.4450617590158242, + "grad_norm": 33.840262550767406, + "learning_rate": 7.114983247048309e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.203125, + "logps/chosen": -1050.0, + "logps/rejected": -848.0, + "loss": 0.2939, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.921875, + "rewards/margins": 8.46875, + "rewards/rejected": -5.5625, + "step": 2243 + }, + { + "epoch": 0.44526018155662483, + "grad_norm": 32.13200974364578, + "learning_rate": 7.112073827046088e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 3.7734375, + "logps/chosen": -1314.0, + "logps/rejected": -640.0, + "loss": 0.3623, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4921875, + "rewards/margins": 7.8515625, + "rewards/rejected": -5.3515625, + "step": 2244 + }, + { + "epoch": 0.4454586040974255, + "grad_norm": 30.75324847001287, + "learning_rate": 7.109163633760688e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.484375, + "logps/chosen": -990.0, + "logps/rejected": -609.5, + "loss": 0.3183, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.88671875, + "rewards/margins": 7.9921875, + "rewards/rejected": -5.109375, + "step": 2245 + }, + { + "epoch": 0.4456570266382261, + "grad_norm": 27.568911555805343, + "learning_rate": 7.106252668588076e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.5859375, + "logps/chosen": -1074.0, + "logps/rejected": -647.0, + "loss": 0.293, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.919921875, + "rewards/margins": 8.4140625, + "rewards/rejected": -5.4765625, + "step": 2246 + }, + { + "epoch": 0.44585544917902675, + "grad_norm": 36.11994200245718, + "learning_rate": 7.103340932924593e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 3.9765625, + "logps/chosen": 
-1397.0, + "logps/rejected": -1009.0, + "loss": 0.2706, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.056640625, + "rewards/margins": 8.7734375, + "rewards/rejected": -5.7265625, + "step": 2247 + }, + { + "epoch": 0.44605387171982736, + "grad_norm": 40.325377972638364, + "learning_rate": 7.100428428166945e-07, + "logits/chosen": 4.6875, + "logits/rejected": 4.23828125, + "logps/chosen": -1336.0, + "logps/rejected": -722.0, + "loss": 0.4261, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.21875, + "rewards/margins": 7.47265625, + "rewards/rejected": -5.25390625, + "step": 2248 + }, + { + "epoch": 0.446252294260628, + "grad_norm": 31.777808462923527, + "learning_rate": 7.097515155712207e-07, + "logits/chosen": 3.828125, + "logits/rejected": 3.67578125, + "logps/chosen": -1026.0, + "logps/rejected": -781.0, + "loss": 0.4903, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.280029296875, + "rewards/margins": 6.0546875, + "rewards/rejected": -4.7734375, + "step": 2249 + }, + { + "epoch": 0.4464507168014286, + "grad_norm": 41.382748775554646, + "learning_rate": 7.094601116957826e-07, + "logits/chosen": 3.96875, + "logits/rejected": 4.01953125, + "logps/chosen": -961.5, + "logps/rejected": -540.5, + "loss": 0.5269, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.6953125, + "rewards/margins": 6.0078125, + "rewards/rejected": -4.310546875, + "step": 2250 + }, + { + "epoch": 0.4466491393422293, + "grad_norm": 27.880567125332956, + "learning_rate": 7.091686313301615e-07, + "logits/chosen": 3.69140625, + "logits/rejected": 3.984375, + "logps/chosen": -1456.0, + "logps/rejected": -1361.0, + "loss": 0.3516, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.27734375, + "rewards/margins": 10.234375, + "rewards/rejected": -6.984375, + "step": 2251 + }, + { + "epoch": 0.44684756188302993, + "grad_norm": 38.15623665132649, + "learning_rate": 7.088770746141755e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.3359375, + "logps/chosen": -1017.0, + 
"logps/rejected": -926.0, + "loss": 0.4526, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.00927734375, + "rewards/margins": 7.3828125, + "rewards/rejected": -5.37109375, + "step": 2252 + }, + { + "epoch": 0.44704598442383053, + "grad_norm": 39.009664882818306, + "learning_rate": 7.085854416876788e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.01953125, + "logps/chosen": -942.0, + "logps/rejected": -687.0, + "loss": 0.4582, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.75, + "rewards/margins": 6.6875, + "rewards/rejected": -4.91796875, + "step": 2253 + }, + { + "epoch": 0.4472444069646312, + "grad_norm": 37.47708287390915, + "learning_rate": 7.082937326905628e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.06640625, + "logps/chosen": -1211.0, + "logps/rejected": -781.0, + "loss": 0.3383, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.294921875, + "rewards/margins": 8.21875, + "rewards/rejected": -5.9140625, + "step": 2254 + }, + { + "epoch": 0.4474428295054318, + "grad_norm": 26.660017117115064, + "learning_rate": 7.08001947762755e-07, + "logits/chosen": 4.5703125, + "logits/rejected": 4.421875, + "logps/chosen": -1057.0, + "logps/rejected": -849.0, + "loss": 0.4664, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.506591796875, + "rewards/margins": 6.638671875, + "rewards/rejected": -4.1171875, + "step": 2255 + }, + { + "epoch": 0.44764125204623245, + "grad_norm": 34.43940685356045, + "learning_rate": 7.077100870442194e-07, + "logits/chosen": 3.984375, + "logits/rejected": 3.828125, + "logps/chosen": -1149.0, + "logps/rejected": -645.5, + "loss": 0.3506, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.224609375, + "rewards/margins": 6.984375, + "rewards/rejected": -4.765625, + "step": 2256 + }, + { + "epoch": 0.4478396745870331, + "grad_norm": 36.310774815931005, + "learning_rate": 7.074181506749567e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 4.1171875, + "logps/chosen": -1161.0, + 
"logps/rejected": -798.0, + "loss": 0.3575, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.23828125, + "rewards/margins": 6.69140625, + "rewards/rejected": -3.44921875, + "step": 2257 + }, + { + "epoch": 0.4480380971278337, + "grad_norm": 32.56592447231199, + "learning_rate": 7.071261387950031e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.078125, + "logps/chosen": -803.0, + "logps/rejected": -753.5, + "loss": 0.4742, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.197265625, + "rewards/margins": 5.859375, + "rewards/rejected": -3.666015625, + "step": 2258 + }, + { + "epoch": 0.44823651966863437, + "grad_norm": 30.039960567808304, + "learning_rate": 7.068340515444321e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.11328125, + "logps/chosen": -1321.0, + "logps/rejected": -753.5, + "loss": 0.386, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.46875, + "rewards/margins": 7.0546875, + "rewards/rejected": -5.5859375, + "step": 2259 + }, + { + "epoch": 0.448434942209435, + "grad_norm": 33.006797854794904, + "learning_rate": 7.065418890633521e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 4.1484375, + "logps/chosen": -922.0, + "logps/rejected": -814.5, + "loss": 0.4691, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.953125, + "rewards/margins": 6.84375, + "rewards/rejected": -4.88671875, + "step": 2260 + }, + { + "epoch": 0.44863336475023563, + "grad_norm": 29.359396692389232, + "learning_rate": 7.06249651491909e-07, + "logits/chosen": 4.41796875, + "logits/rejected": 4.484375, + "logps/chosen": -934.0, + "logps/rejected": -672.0, + "loss": 0.5261, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.044921875, + "rewards/margins": 4.921875, + "rewards/rejected": -2.87109375, + "step": 2261 + }, + { + "epoch": 0.4488317872910363, + "grad_norm": 32.10618285316815, + "learning_rate": 7.059573389702833e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.28515625, + "logps/chosen": -1125.0, + "logps/rejected": 
-722.0, + "loss": 0.3602, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.77734375, + "rewards/margins": 6.46484375, + "rewards/rejected": -3.685546875, + "step": 2262 + }, + { + "epoch": 0.4490302098318369, + "grad_norm": 24.615523194670175, + "learning_rate": 7.056649516386927e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.140625, + "logps/chosen": -1344.5, + "logps/rejected": -789.5, + "loss": 0.3114, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.6640625, + "rewards/margins": 8.6328125, + "rewards/rejected": -4.96875, + "step": 2263 + }, + { + "epoch": 0.44922863237263755, + "grad_norm": 32.14823950794793, + "learning_rate": 7.053724896373898e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.19921875, + "logps/chosen": -895.0, + "logps/rejected": -857.0, + "loss": 0.3795, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.1171875, + "rewards/margins": 7.3515625, + "rewards/rejected": -5.234375, + "step": 2264 + }, + { + "epoch": 0.44942705491343815, + "grad_norm": 31.686998045535447, + "learning_rate": 7.050799531066633e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 3.91015625, + "logps/chosen": -1023.0, + "logps/rejected": -554.5, + "loss": 0.4553, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.8265380859375, + "rewards/margins": 6.8125, + "rewards/rejected": -4.9765625, + "step": 2265 + }, + { + "epoch": 0.4496254774542388, + "grad_norm": 33.535401110945, + "learning_rate": 7.047873421868383e-07, + "logits/chosen": 4.078125, + "logits/rejected": 4.2109375, + "logps/chosen": -764.0, + "logps/rejected": -801.5, + "loss": 0.4087, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.10546875, + "rewards/margins": 6.87890625, + "rewards/rejected": -4.76953125, + "step": 2266 + }, + { + "epoch": 0.4498238999950394, + "grad_norm": 28.851979832633045, + "learning_rate": 7.044946570182746e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.88671875, + "logps/chosen": -724.0, + "logps/rejected": -704.0, + 
"loss": 0.3544, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.224609375, + "rewards/margins": 7.40625, + "rewards/rejected": -5.171875, + "step": 2267 + }, + { + "epoch": 0.45002232253584007, + "grad_norm": 29.872189986693776, + "learning_rate": 7.042018977413684e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 3.515625, + "logps/chosen": -1597.0, + "logps/rejected": -699.5, + "loss": 0.3219, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.80859375, + "rewards/margins": 9.2734375, + "rewards/rejected": -6.46875, + "step": 2268 + }, + { + "epoch": 0.4502207450766407, + "grad_norm": 36.77977305666732, + "learning_rate": 7.039090644965509e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.171875, + "logps/chosen": -992.0, + "logps/rejected": -666.0, + "loss": 0.4218, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.095703125, + "rewards/margins": 6.3984375, + "rewards/rejected": -4.30078125, + "step": 2269 + }, + { + "epoch": 0.45041916761744133, + "grad_norm": 46.56692345839905, + "learning_rate": 7.036161574242892e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.2265625, + "logps/chosen": -1186.0, + "logps/rejected": -954.0, + "loss": 0.457, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.197265625, + "rewards/margins": 7.34765625, + "rewards/rejected": -5.14453125, + "step": 2270 + }, + { + "epoch": 0.450617590158242, + "grad_norm": 45.68411339930738, + "learning_rate": 7.033231766650853e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 4.01171875, + "logps/chosen": -1578.0, + "logps/rejected": -924.0, + "loss": 0.3401, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.05859375, + "rewards/margins": 10.3203125, + "rewards/rejected": -7.234375, + "step": 2271 + }, + { + "epoch": 0.4508160126990426, + "grad_norm": 28.170169100811112, + "learning_rate": 7.030301223594773e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.015625, + "logps/chosen": -970.0, + "logps/rejected": -928.0, + "loss": 0.4703, + 
"rewards/accuracies": 0.78125, + "rewards/chosen": 2.53125, + "rewards/margins": 7.8828125, + "rewards/rejected": -5.359375, + "step": 2272 + }, + { + "epoch": 0.45101443523984325, + "grad_norm": 29.690084148833705, + "learning_rate": 7.027369946480379e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.04296875, + "logps/chosen": -987.0, + "logps/rejected": -666.5, + "loss": 0.399, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.552734375, + "rewards/margins": 6.71875, + "rewards/rejected": -4.16796875, + "step": 2273 + }, + { + "epoch": 0.4512128577806439, + "grad_norm": 30.530791823169512, + "learning_rate": 7.024437936713751e-07, + "logits/chosen": 4.37890625, + "logits/rejected": 4.4453125, + "logps/chosen": -1081.0, + "logps/rejected": -680.5, + "loss": 0.5173, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.0546875, + "rewards/margins": 5.9921875, + "rewards/rejected": -3.93359375, + "step": 2274 + }, + { + "epoch": 0.4514112803214445, + "grad_norm": 42.464182271685, + "learning_rate": 7.021505195701325e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 3.92578125, + "logps/chosen": -1153.0, + "logps/rejected": -1149.0, + "loss": 0.4768, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.98388671875, + "rewards/margins": 8.99609375, + "rewards/rejected": -7.02734375, + "step": 2275 + }, + { + "epoch": 0.45160970286224517, + "grad_norm": 33.29743220209419, + "learning_rate": 7.018571724849883e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.04296875, + "logps/chosen": -845.0, + "logps/rejected": -785.0, + "loss": 0.3871, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.400390625, + "rewards/margins": 6.8828125, + "rewards/rejected": -4.490234375, + "step": 2276 + }, + { + "epoch": 0.45180812540304577, + "grad_norm": 30.88114698986957, + "learning_rate": 7.01563752556656e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.984375, + "logps/chosen": -994.0, + "logps/rejected": -880.0, + "loss": 0.3889, + 
"rewards/accuracies": 0.9375, + "rewards/chosen": 2.388671875, + "rewards/margins": 7.3359375, + "rewards/rejected": -4.96875, + "step": 2277 + }, + { + "epoch": 0.4520065479438464, + "grad_norm": 29.718197735579377, + "learning_rate": 7.012702599258838e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.08984375, + "logps/chosen": -792.0, + "logps/rejected": -656.0, + "loss": 0.381, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3125, + "rewards/margins": 7.0, + "rewards/rejected": -4.703125, + "step": 2278 + }, + { + "epoch": 0.45220497048464703, + "grad_norm": 30.647340103806126, + "learning_rate": 7.009766947334551e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.265625, + "logps/chosen": -1207.0, + "logps/rejected": -1544.0, + "loss": 0.4055, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.00390625, + "rewards/margins": 8.7265625, + "rewards/rejected": -6.73828125, + "step": 2279 + }, + { + "epoch": 0.4524033930254477, + "grad_norm": 31.1700608875396, + "learning_rate": 7.006830571201877e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.29296875, + "logps/chosen": -1169.0, + "logps/rejected": -1575.0, + "loss": 0.3727, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6875, + "rewards/margins": 19.3515625, + "rewards/rejected": -17.734375, + "step": 2280 + }, + { + "epoch": 0.45260181556624834, + "grad_norm": 30.127841441329476, + "learning_rate": 7.003893472269344e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 3.96875, + "logps/chosen": -1036.5, + "logps/rejected": -763.0, + "loss": 0.2824, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.263671875, + "rewards/margins": 9.1328125, + "rewards/rejected": -6.859375, + "step": 2281 + }, + { + "epoch": 0.45280023810704895, + "grad_norm": 32.17025251677593, + "learning_rate": 7.000955651945826e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.26171875, + "logps/chosen": -1121.5, + "logps/rejected": -678.5, + "loss": 0.5237, + "rewards/accuracies": 
0.71875, + "rewards/chosen": 2.76171875, + "rewards/margins": 5.4453125, + "rewards/rejected": -2.677734375, + "step": 2282 + }, + { + "epoch": 0.4529986606478496, + "grad_norm": 42.03449028411156, + "learning_rate": 6.998017111640544e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.8125, + "logps/chosen": -855.0, + "logps/rejected": -832.0, + "loss": 0.5615, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.8486328125, + "rewards/margins": 5.4453125, + "rewards/rejected": -4.59375, + "step": 2283 + }, + { + "epoch": 0.4531970831886502, + "grad_norm": 34.70435643273412, + "learning_rate": 6.995077852763064e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.22265625, + "logps/chosen": -1218.0, + "logps/rejected": -903.5, + "loss": 0.4226, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.607421875, + "rewards/margins": 7.2578125, + "rewards/rejected": -5.65625, + "step": 2284 + }, + { + "epoch": 0.45339550572945087, + "grad_norm": 32.749561113877846, + "learning_rate": 6.992137876723295e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.05078125, + "logps/chosen": -903.0, + "logps/rejected": -951.5, + "loss": 0.4032, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.58935546875, + "rewards/margins": 7.34375, + "rewards/rejected": -5.765625, + "step": 2285 + }, + { + "epoch": 0.4535939282702515, + "grad_norm": 27.535808208392087, + "learning_rate": 6.989197184931491e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.328125, + "logps/chosen": -1347.0, + "logps/rejected": -1784.0, + "loss": 0.3624, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.90625, + "rewards/margins": 11.6015625, + "rewards/rejected": -8.67578125, + "step": 2286 + }, + { + "epoch": 0.4537923508110521, + "grad_norm": 28.725301183628954, + "learning_rate": 6.986255778798252e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.046875, + "logps/chosen": -939.0, + "logps/rejected": -588.0, + "loss": 0.4822, + "rewards/accuracies": 0.90625, + 
"rewards/chosen": 2.060546875, + "rewards/margins": 5.9765625, + "rewards/rejected": -3.91796875, + "step": 2287 + }, + { + "epoch": 0.4539907733518528, + "grad_norm": 33.23088462506256, + "learning_rate": 6.983313659734517e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.2265625, + "logps/chosen": -1390.0, + "logps/rejected": -973.0, + "loss": 0.3226, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8515625, + "rewards/margins": 7.765625, + "rewards/rejected": -4.91015625, + "step": 2288 + }, + { + "epoch": 0.4541891958926534, + "grad_norm": 36.59675990238306, + "learning_rate": 6.980370829151566e-07, + "logits/chosen": 4.4921875, + "logits/rejected": 4.484375, + "logps/chosen": -621.0, + "logps/rejected": -541.0, + "loss": 0.4537, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.814453125, + "rewards/margins": 5.7734375, + "rewards/rejected": -3.95703125, + "step": 2289 + }, + { + "epoch": 0.45438761843345404, + "grad_norm": 34.09262796937932, + "learning_rate": 6.977427288461025e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.28125, + "logps/chosen": -849.0, + "logps/rejected": -703.0, + "loss": 0.2866, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.126953125, + "rewards/margins": 9.53125, + "rewards/rejected": -7.421875, + "step": 2290 + }, + { + "epoch": 0.4545860409742547, + "grad_norm": 38.66207252977798, + "learning_rate": 6.974483039074857e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.734375, + "logps/chosen": -1087.0, + "logps/rejected": -698.5, + "loss": 0.4719, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.0859375, + "rewards/margins": 6.71875, + "rewards/rejected": -4.63671875, + "step": 2291 + }, + { + "epoch": 0.4547844635150553, + "grad_norm": 29.015499133270563, + "learning_rate": 6.971538082405367e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 3.91796875, + "logps/chosen": -972.0, + "logps/rejected": -711.0, + "loss": 0.4084, + "rewards/accuracies": 0.8125, + "rewards/chosen": 
2.3203125, + "rewards/margins": 6.890625, + "rewards/rejected": -4.5703125, + "step": 2292 + }, + { + "epoch": 0.45498288605585596, + "grad_norm": 36.069378839688035, + "learning_rate": 6.9685924198652e-07, + "logits/chosen": 4.5234375, + "logits/rejected": 4.640625, + "logps/chosen": -814.5, + "logps/rejected": -912.0, + "loss": 0.3569, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.427734375, + "rewards/margins": 8.453125, + "rewards/rejected": -6.03125, + "step": 2293 + }, + { + "epoch": 0.45518130859665656, + "grad_norm": 35.769550166008656, + "learning_rate": 6.965646052867335e-07, + "logits/chosen": 4.34765625, + "logits/rejected": 4.33984375, + "logps/chosen": -1050.0, + "logps/rejected": -837.0, + "loss": 0.4326, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.26953125, + "rewards/margins": 6.8125, + "rewards/rejected": -4.53125, + "step": 2294 + }, + { + "epoch": 0.4553797311374572, + "grad_norm": 28.19581214309783, + "learning_rate": 6.962698982825094e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.1171875, + "logps/chosen": -1014.0, + "logps/rejected": -617.0, + "loss": 0.3433, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.1240234375, + "rewards/margins": 7.03125, + "rewards/rejected": -4.91796875, + "step": 2295 + }, + { + "epoch": 0.4555781536782578, + "grad_norm": 30.95512730284026, + "learning_rate": 6.959751211152132e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.296875, + "logps/chosen": -748.5, + "logps/rejected": -1686.5, + "loss": 0.3302, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.181640625, + "rewards/margins": 19.9921875, + "rewards/rejected": -17.7734375, + "step": 2296 + }, + { + "epoch": 0.4557765762190585, + "grad_norm": 35.51767346878608, + "learning_rate": 6.956802739262445e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.328125, + "logps/chosen": -1080.0, + "logps/rejected": -794.0, + "loss": 0.3727, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.2578125, + 
"rewards/margins": 8.1640625, + "rewards/rejected": -5.9140625, + "step": 2297 + }, + { + "epoch": 0.45597499875985914, + "grad_norm": 31.636060646031986, + "learning_rate": 6.953853568570361e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 3.96875, + "logps/chosen": -1203.0, + "logps/rejected": -853.0, + "loss": 0.2847, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.873046875, + "rewards/margins": 9.3203125, + "rewards/rejected": -6.4453125, + "step": 2298 + }, + { + "epoch": 0.45617342130065974, + "grad_norm": 38.33053044101982, + "learning_rate": 6.950903700490545e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.3671875, + "logps/chosen": -1210.0, + "logps/rejected": -1520.0, + "loss": 0.4055, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.2158203125, + "rewards/margins": 8.6328125, + "rewards/rejected": -6.41015625, + "step": 2299 + }, + { + "epoch": 0.4563718438414604, + "grad_norm": 35.92682727490381, + "learning_rate": 6.947953136437997e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.21875, + "logps/chosen": -1088.0, + "logps/rejected": -677.0, + "loss": 0.4571, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.3896484375, + "rewards/margins": 7.1484375, + "rewards/rejected": -4.75390625, + "step": 2300 + }, + { + "epoch": 0.456570266382261, + "grad_norm": 38.717705189737615, + "learning_rate": 6.945001877828049e-07, + "logits/chosen": 4.171875, + "logits/rejected": 3.87109375, + "logps/chosen": -823.0, + "logps/rejected": -664.5, + "loss": 0.4315, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.10546875, + "rewards/margins": 7.015625, + "rewards/rejected": -4.8984375, + "step": 2301 + }, + { + "epoch": 0.45676868892306166, + "grad_norm": 33.295821435515144, + "learning_rate": 6.942049926076369e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.1328125, + "logps/chosen": -1182.0, + "logps/rejected": -753.0, + "loss": 0.4942, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.1767578125, + 
"rewards/margins": 6.4296875, + "rewards/rejected": -4.26171875, + "step": 2302 + }, + { + "epoch": 0.4569671114638623, + "grad_norm": 35.21241984625857, + "learning_rate": 6.939097282598956e-07, + "logits/chosen": 3.953125, + "logits/rejected": 4.1015625, + "logps/chosen": -788.0, + "logps/rejected": -657.0, + "loss": 0.524, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.3310546875, + "rewards/margins": 6.49609375, + "rewards/rejected": -5.15625, + "step": 2303 + }, + { + "epoch": 0.4571655340046629, + "grad_norm": 31.806864372221376, + "learning_rate": 6.936143948812138e-07, + "logits/chosen": 4.0, + "logits/rejected": 3.84765625, + "logps/chosen": -922.0, + "logps/rejected": -639.5, + "loss": 0.4428, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.7265625, + "rewards/margins": 6.3515625, + "rewards/rejected": -4.62109375, + "step": 2304 + }, + { + "epoch": 0.4573639565454636, + "grad_norm": 33.39621535812236, + "learning_rate": 6.93318992613258e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.5859375, + "logps/chosen": -1051.0, + "logps/rejected": -1435.0, + "loss": 0.4871, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.28125, + "rewards/margins": 8.40625, + "rewards/rejected": -6.13671875, + "step": 2305 + }, + { + "epoch": 0.4575623790862642, + "grad_norm": 30.44536322076716, + "learning_rate": 6.930235215977272e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.1328125, + "logps/chosen": -917.0, + "logps/rejected": -682.0, + "loss": 0.4464, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.837890625, + "rewards/margins": 7.2890625, + "rewards/rejected": -4.45703125, + "step": 2306 + }, + { + "epoch": 0.45776080162706484, + "grad_norm": 29.32816974669742, + "learning_rate": 6.927279819763535e-07, + "logits/chosen": 4.609375, + "logits/rejected": 4.33203125, + "logps/chosen": -1125.0, + "logps/rejected": -684.5, + "loss": 0.3742, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3125, + "rewards/margins": 7.1875, 
+ "rewards/rejected": -4.87890625, + "step": 2307 + }, + { + "epoch": 0.45795922416786544, + "grad_norm": 32.087843615563095, + "learning_rate": 6.924323738909024e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 3.94921875, + "logps/chosen": -796.0, + "logps/rejected": -537.0, + "loss": 0.4067, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.56640625, + "rewards/margins": 6.828125, + "rewards/rejected": -5.265625, + "step": 2308 + }, + { + "epoch": 0.4581576467086661, + "grad_norm": 34.47259959501853, + "learning_rate": 6.921366974831715e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 4.1484375, + "logps/chosen": -1039.0, + "logps/rejected": -731.0, + "loss": 0.5276, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.435546875, + "rewards/margins": 5.46875, + "rewards/rejected": -4.0390625, + "step": 2309 + }, + { + "epoch": 0.45835606924946676, + "grad_norm": 30.044384725343654, + "learning_rate": 6.918409528949916e-07, + "logits/chosen": 4.43359375, + "logits/rejected": 4.48046875, + "logps/chosen": -1013.0, + "logps/rejected": -723.5, + "loss": 0.5006, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.830810546875, + "rewards/margins": 6.4375, + "rewards/rejected": -4.62109375, + "step": 2310 + }, + { + "epoch": 0.45855449179026736, + "grad_norm": 30.21758324390376, + "learning_rate": 6.91545140268226e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.66796875, + "logps/chosen": -889.0, + "logps/rejected": -774.5, + "loss": 0.358, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.525390625, + "rewards/margins": 7.453125, + "rewards/rejected": -4.921875, + "step": 2311 + }, + { + "epoch": 0.458752914331068, + "grad_norm": 37.28452734283969, + "learning_rate": 6.912492597447712e-07, + "logits/chosen": 4.22265625, + "logits/rejected": 4.12109375, + "logps/chosen": -893.5, + "logps/rejected": -735.5, + "loss": 0.4509, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.908203125, + "rewards/margins": 6.2578125, + 
"rewards/rejected": -4.3515625, + "step": 2312 + }, + { + "epoch": 0.4589513368718686, + "grad_norm": 41.422761733848745, + "learning_rate": 6.909533114665555e-07, + "logits/chosen": 4.40234375, + "logits/rejected": 4.4296875, + "logps/chosen": -997.0, + "logps/rejected": -668.0, + "loss": 0.4514, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.53515625, + "rewards/margins": 7.0234375, + "rewards/rejected": -5.50390625, + "step": 2313 + }, + { + "epoch": 0.4591497594126693, + "grad_norm": 31.19265459358581, + "learning_rate": 6.906572955755399e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 4.1953125, + "logps/chosen": -888.0, + "logps/rejected": -805.0, + "loss": 0.5106, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.2664794921875, + "rewards/margins": 5.8359375, + "rewards/rejected": -4.57421875, + "step": 2314 + }, + { + "epoch": 0.45934818195346994, + "grad_norm": 22.59865880303454, + "learning_rate": 6.903612122137185e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 3.96484375, + "logps/chosen": -1184.0, + "logps/rejected": -725.0, + "loss": 0.2568, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.96484375, + "rewards/margins": 8.828125, + "rewards/rejected": -5.875, + "step": 2315 + }, + { + "epoch": 0.45954660449427054, + "grad_norm": 27.588331487412795, + "learning_rate": 6.900650615231166e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.234375, + "logps/chosen": -1131.0, + "logps/rejected": -1422.0, + "loss": 0.5322, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.766357421875, + "rewards/margins": 8.25, + "rewards/rejected": -6.484375, + "step": 2316 + }, + { + "epoch": 0.4597450270350712, + "grad_norm": 35.048909522564685, + "learning_rate": 6.897688436457926e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.37890625, + "logps/chosen": -729.0, + "logps/rejected": -780.0, + "loss": 0.4362, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.169921875, + "rewards/margins": 7.59375, + 
"rewards/rejected": -4.4296875, + "step": 2317 + }, + { + "epoch": 0.4599434495758718, + "grad_norm": 34.660851694405174, + "learning_rate": 6.894725587238372e-07, + "logits/chosen": 3.84375, + "logits/rejected": 3.68359375, + "logps/chosen": -1020.0, + "logps/rejected": -682.0, + "loss": 0.3527, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.453125, + "rewards/margins": 6.859375, + "rewards/rejected": -4.3984375, + "step": 2318 + }, + { + "epoch": 0.46014187211667246, + "grad_norm": 31.828211282386356, + "learning_rate": 6.891762068993729e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.0546875, + "logps/chosen": -789.0, + "logps/rejected": -1077.5, + "loss": 0.4114, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.8203125, + "rewards/margins": 6.93359375, + "rewards/rejected": -5.1171875, + "step": 2319 + }, + { + "epoch": 0.4603402946574731, + "grad_norm": 33.097478392730416, + "learning_rate": 6.888797883145544e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.484375, + "logps/chosen": -851.0, + "logps/rejected": -498.0, + "loss": 0.3493, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.154296875, + "rewards/margins": 5.69140625, + "rewards/rejected": -3.53515625, + "step": 2320 + }, + { + "epoch": 0.4605387171982737, + "grad_norm": 29.84352847362822, + "learning_rate": 6.885833031115684e-07, + "logits/chosen": 4.203125, + "logits/rejected": 3.8828125, + "logps/chosen": -1021.0, + "logps/rejected": -775.0, + "loss": 0.3765, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.142578125, + "rewards/margins": 8.4921875, + "rewards/rejected": -5.3515625, + "step": 2321 + }, + { + "epoch": 0.4607371397390744, + "grad_norm": 28.537996169350144, + "learning_rate": 6.882867514326335e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.56640625, + "logps/chosen": -1084.0, + "logps/rejected": -673.0, + "loss": 0.4529, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.56640625, + "rewards/margins": 6.640625, + 
"rewards/rejected": -4.06640625, + "step": 2322 + }, + { + "epoch": 0.460935562279875, + "grad_norm": 30.008224684306928, + "learning_rate": 6.879901334200005e-07, + "logits/chosen": 4.4921875, + "logits/rejected": 4.2109375, + "logps/chosen": -1196.0, + "logps/rejected": -607.0, + "loss": 0.3123, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.35546875, + "rewards/margins": 8.28125, + "rewards/rejected": -4.9375, + "step": 2323 + }, + { + "epoch": 0.46113398482067564, + "grad_norm": 31.970089990114452, + "learning_rate": 6.876934492159515e-07, + "logits/chosen": 4.36328125, + "logits/rejected": 4.36328125, + "logps/chosen": -879.0, + "logps/rejected": -814.0, + "loss": 0.4092, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.84814453125, + "rewards/margins": 6.7421875, + "rewards/rejected": -4.91015625, + "step": 2324 + }, + { + "epoch": 0.46133240736147624, + "grad_norm": 24.765964662262906, + "learning_rate": 6.873966989628009e-07, + "logits/chosen": 3.8046875, + "logits/rejected": 3.89453125, + "logps/chosen": -980.0, + "logps/rejected": -678.0, + "loss": 0.2744, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0234375, + "rewards/margins": 7.828125, + "rewards/rejected": -4.796875, + "step": 2325 + }, + { + "epoch": 0.4615308299022769, + "grad_norm": 32.01580736773898, + "learning_rate": 6.870998828028944e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 4.10546875, + "logps/chosen": -930.0, + "logps/rejected": -713.5, + "loss": 0.5119, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.873046875, + "rewards/margins": 5.90625, + "rewards/rejected": -4.0390625, + "step": 2326 + }, + { + "epoch": 0.46172925244307755, + "grad_norm": 34.14253477147044, + "learning_rate": 6.868030008786094e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.8125, + "logps/chosen": -896.5, + "logps/rejected": -827.0, + "loss": 0.2781, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.345703125, + "rewards/margins": 9.5078125, + "rewards/rejected": 
-7.1640625, + "step": 2327 + }, + { + "epoch": 0.46192767498387816, + "grad_norm": 32.02800733169996, + "learning_rate": 6.865060533323551e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 4.13671875, + "logps/chosen": -710.0, + "logps/rejected": -1154.0, + "loss": 0.3808, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.755859375, + "rewards/margins": 8.4453125, + "rewards/rejected": -6.68359375, + "step": 2328 + }, + { + "epoch": 0.4621260975246788, + "grad_norm": 27.965231783817305, + "learning_rate": 6.862090403065717e-07, + "logits/chosen": 3.62890625, + "logits/rejected": 3.4140625, + "logps/chosen": -1432.0, + "logps/rejected": -849.0, + "loss": 0.2227, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.05078125, + "rewards/margins": 8.8671875, + "rewards/rejected": -5.8203125, + "step": 2329 + }, + { + "epoch": 0.4623245200654794, + "grad_norm": 24.521746125835165, + "learning_rate": 6.859119619437311e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.2578125, + "logps/chosen": -1052.0, + "logps/rejected": -761.0, + "loss": 0.455, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.072265625, + "rewards/margins": 7.5078125, + "rewards/rejected": -5.42578125, + "step": 2330 + }, + { + "epoch": 0.4625229426062801, + "grad_norm": 34.59137776308569, + "learning_rate": 6.856148183863367e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.0390625, + "logps/chosen": -1120.0, + "logps/rejected": -692.0, + "loss": 0.4423, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8427734375, + "rewards/margins": 6.484375, + "rewards/rejected": -4.64453125, + "step": 2331 + }, + { + "epoch": 0.46272136514708073, + "grad_norm": 25.39059778870808, + "learning_rate": 6.853176097769228e-07, + "logits/chosen": 3.84375, + "logits/rejected": 3.484375, + "logps/chosen": -1462.0, + "logps/rejected": -696.0, + "loss": 0.2902, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.66015625, + "rewards/margins": 8.5078125, + "rewards/rejected": 
-4.85546875, + "step": 2332 + }, + { + "epoch": 0.46291978768788133, + "grad_norm": 27.352099211833067, + "learning_rate": 6.850203362580553e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.16796875, + "logps/chosen": -1215.0, + "logps/rejected": -809.5, + "loss": 0.368, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.4765625, + "rewards/margins": 8.640625, + "rewards/rejected": -5.134765625, + "step": 2333 + }, + { + "epoch": 0.463118210228682, + "grad_norm": 35.19248664453982, + "learning_rate": 6.847229979723308e-07, + "logits/chosen": 4.078125, + "logits/rejected": 3.93359375, + "logps/chosen": -1300.5, + "logps/rejected": -682.5, + "loss": 0.4824, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3232421875, + "rewards/margins": 7.1171875, + "rewards/rejected": -4.796875, + "step": 2334 + }, + { + "epoch": 0.4633166327694826, + "grad_norm": 27.51389321641469, + "learning_rate": 6.844255950623775e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 3.87109375, + "logps/chosen": -1010.0, + "logps/rejected": -626.0, + "loss": 0.3104, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.736328125, + "rewards/margins": 8.7578125, + "rewards/rejected": -6.01171875, + "step": 2335 + }, + { + "epoch": 0.46351505531028325, + "grad_norm": 33.577706126982164, + "learning_rate": 6.841281276708538e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.08203125, + "logps/chosen": -832.5, + "logps/rejected": -694.0, + "loss": 0.2717, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.74609375, + "rewards/margins": 17.984375, + "rewards/rejected": -15.28125, + "step": 2336 + }, + { + "epoch": 0.4637134778510839, + "grad_norm": 42.137725556122795, + "learning_rate": 6.8383059594045e-07, + "logits/chosen": 3.6015625, + "logits/rejected": 3.8828125, + "logps/chosen": -794.5, + "logps/rejected": -583.5, + "loss": 0.4958, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.951171875, + "rewards/margins": 6.0234375, + "rewards/rejected": -4.078125, 
+ "step": 2337 + }, + { + "epoch": 0.4639119003918845, + "grad_norm": 39.291217257866855, + "learning_rate": 6.835330000138864e-07, + "logits/chosen": 3.5546875, + "logits/rejected": 3.5078125, + "logps/chosen": -849.0, + "logps/rejected": -591.5, + "loss": 0.4299, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.05859375, + "rewards/margins": 7.1953125, + "rewards/rejected": -5.12109375, + "step": 2338 + }, + { + "epoch": 0.46411032293268517, + "grad_norm": 37.005562224904345, + "learning_rate": 6.832353400339149e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.12890625, + "logps/chosen": -1501.0, + "logps/rejected": -735.5, + "loss": 0.4673, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.55078125, + "rewards/margins": 6.1044921875, + "rewards/rejected": -4.55078125, + "step": 2339 + }, + { + "epoch": 0.4643087454734858, + "grad_norm": 32.629075753641196, + "learning_rate": 6.829376161433175e-07, + "logits/chosen": 4.359375, + "logits/rejected": 4.375, + "logps/chosen": -734.0, + "logps/rejected": -726.0, + "loss": 0.3731, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.8125, + "rewards/margins": 7.7421875, + "rewards/rejected": -5.9140625, + "step": 2340 + }, + { + "epoch": 0.46450716801428643, + "grad_norm": 34.49700802315222, + "learning_rate": 6.826398284849069e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 3.94921875, + "logps/chosen": -989.0, + "logps/rejected": -753.5, + "loss": 0.3472, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4794921875, + "rewards/margins": 7.921875, + "rewards/rejected": -5.4453125, + "step": 2341 + }, + { + "epoch": 0.46470559055508703, + "grad_norm": 62.318084292326816, + "learning_rate": 6.823419772015266e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 3.94140625, + "logps/chosen": -742.0, + "logps/rejected": -598.0, + "loss": 0.6241, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.021240234375, + "rewards/margins": 5.3046875, + "rewards/rejected": -4.26171875, + "step": 
2342 + }, + { + "epoch": 0.4649040130958877, + "grad_norm": 35.635618145887754, + "learning_rate": 6.820440624360509e-07, + "logits/chosen": 3.70703125, + "logits/rejected": 3.8828125, + "logps/chosen": -877.5, + "logps/rejected": -630.5, + "loss": 0.4751, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.568115234375, + "rewards/margins": 6.2890625, + "rewards/rejected": -4.72265625, + "step": 2343 + }, + { + "epoch": 0.46510243563668835, + "grad_norm": 31.81010490781726, + "learning_rate": 6.817460843313839e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.5703125, + "logps/chosen": -1018.5, + "logps/rejected": -855.0, + "loss": 0.5515, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.81103515625, + "rewards/margins": 12.734375, + "rewards/rejected": -11.9140625, + "step": 2344 + }, + { + "epoch": 0.46530085817748895, + "grad_norm": 24.80178964194352, + "learning_rate": 6.814480430304604e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.453125, + "logps/chosen": -1419.0, + "logps/rejected": -782.5, + "loss": 0.2149, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.98828125, + "rewards/margins": 10.3671875, + "rewards/rejected": -6.390625, + "step": 2345 + }, + { + "epoch": 0.4654992807182896, + "grad_norm": 33.92176990559223, + "learning_rate": 6.811499386762458e-07, + "logits/chosen": 4.37109375, + "logits/rejected": 4.22265625, + "logps/chosen": -666.5, + "logps/rejected": -479.5, + "loss": 0.4072, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.046875, + "rewards/margins": 6.7421875, + "rewards/rejected": -4.69140625, + "step": 2346 + }, + { + "epoch": 0.4656977032590902, + "grad_norm": 37.80944756288062, + "learning_rate": 6.80851771411735e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.8984375, + "logps/chosen": -932.0, + "logps/rejected": -649.0, + "loss": 0.3922, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.28759765625, + "rewards/margins": 7.9296875, + "rewards/rejected": -5.6484375, + "step": 2347 + }, + 
{ + "epoch": 0.46589612579989087, + "grad_norm": 26.586978400233445, + "learning_rate": 6.80553541379954e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.86328125, + "logps/chosen": -1172.0, + "logps/rejected": -707.5, + "loss": 0.4538, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.63671875, + "rewards/margins": 7.04296875, + "rewards/rejected": -4.40625, + "step": 2348 + }, + { + "epoch": 0.46609454834069153, + "grad_norm": 36.193399754991034, + "learning_rate": 6.802552487239583e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.17578125, + "logps/chosen": -732.5, + "logps/rejected": -495.0, + "loss": 0.649, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.431640625, + "rewards/margins": 4.03515625, + "rewards/rejected": -3.607421875, + "step": 2349 + }, + { + "epoch": 0.46629297088149213, + "grad_norm": 32.97066280986562, + "learning_rate": 6.799568935868334e-07, + "logits/chosen": 4.546875, + "logits/rejected": 4.44140625, + "logps/chosen": -848.0, + "logps/rejected": -1361.0, + "loss": 0.5541, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.23046875, + "rewards/margins": 6.6953125, + "rewards/rejected": -4.46484375, + "step": 2350 + }, + { + "epoch": 0.4664913934222928, + "grad_norm": 38.66261612833603, + "learning_rate": 6.796584761116952e-07, + "logits/chosen": 4.078125, + "logits/rejected": 3.81640625, + "logps/chosen": -1249.0, + "logps/rejected": -760.0, + "loss": 0.4228, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.48193359375, + "rewards/margins": 7.1328125, + "rewards/rejected": -5.640625, + "step": 2351 + }, + { + "epoch": 0.4666898159630934, + "grad_norm": 31.900922037337082, + "learning_rate": 6.793599964416891e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.390625, + "logps/chosen": -915.5, + "logps/rejected": -513.0, + "loss": 0.3873, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.56640625, + "rewards/margins": 6.4140625, + "rewards/rejected": -3.84375, + "step": 2352 + }, + { + 
"epoch": 0.46688823850389405, + "grad_norm": 30.30425207530361, + "learning_rate": 6.790614547199906e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.37890625, + "logps/chosen": -942.0, + "logps/rejected": -1619.0, + "loss": 0.3111, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.65625, + "rewards/margins": 10.75, + "rewards/rejected": -8.1015625, + "step": 2353 + }, + { + "epoch": 0.46708666104469465, + "grad_norm": 32.88129313527457, + "learning_rate": 6.787628510898048e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.17578125, + "logps/chosen": -1180.0, + "logps/rejected": -880.0, + "loss": 0.4611, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.05078125, + "rewards/margins": 6.18359375, + "rewards/rejected": -4.136474609375, + "step": 2354 + }, + { + "epoch": 0.4672850835854953, + "grad_norm": 33.22106056149221, + "learning_rate": 6.784641856943666e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 3.6796875, + "logps/chosen": -1326.0, + "logps/rejected": -711.0, + "loss": 0.3422, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0419921875, + "rewards/margins": 8.09375, + "rewards/rejected": -5.046875, + "step": 2355 + }, + { + "epoch": 0.46748350612629597, + "grad_norm": 30.478735928675675, + "learning_rate": 6.781654586769406e-07, + "logits/chosen": 3.640625, + "logits/rejected": 3.6171875, + "logps/chosen": -633.0, + "logps/rejected": -549.0, + "loss": 0.5188, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.228515625, + "rewards/margins": 5.84375, + "rewards/rejected": -3.62109375, + "step": 2356 + }, + { + "epoch": 0.46768192866709657, + "grad_norm": 30.798277743199574, + "learning_rate": 6.778666701808205e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 4.0, + "logps/chosen": -1045.0, + "logps/rejected": -713.0, + "loss": 0.4653, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.4951171875, + "rewards/margins": 7.3125, + "rewards/rejected": -4.828125, + "step": 2357 + }, + { + "epoch": 
0.4678803512078972, + "grad_norm": 28.349186938420214, + "learning_rate": 6.775678203493302e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.4609375, + "logps/chosen": -1349.0, + "logps/rejected": -970.0, + "loss": 0.3324, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.015625, + "rewards/margins": 9.28125, + "rewards/rejected": -6.2578125, + "step": 2358 + }, + { + "epoch": 0.46807877374869783, + "grad_norm": 35.141687145130874, + "learning_rate": 6.772689093258224e-07, + "logits/chosen": 3.765625, + "logits/rejected": 3.7421875, + "logps/chosen": -915.0, + "logps/rejected": -735.0, + "loss": 0.5223, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.75927734375, + "rewards/margins": 5.72265625, + "rewards/rejected": -3.9765625, + "step": 2359 + }, + { + "epoch": 0.4682771962894985, + "grad_norm": 30.494780503400047, + "learning_rate": 6.769699372536794e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 3.9453125, + "logps/chosen": -903.0, + "logps/rejected": -740.0, + "loss": 0.4141, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.92333984375, + "rewards/margins": 7.4921875, + "rewards/rejected": -5.5703125, + "step": 2360 + }, + { + "epoch": 0.46847561883029915, + "grad_norm": 30.328695104946757, + "learning_rate": 6.766709042763132e-07, + "logits/chosen": 3.68359375, + "logits/rejected": 3.77734375, + "logps/chosen": -944.0, + "logps/rejected": -1011.5, + "loss": 0.6019, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.0, + "rewards/margins": 5.27734375, + "rewards/rejected": -3.283203125, + "step": 2361 + }, + { + "epoch": 0.46867404137109975, + "grad_norm": 29.550674546501142, + "learning_rate": 6.763718105371641e-07, + "logits/chosen": 4.4453125, + "logits/rejected": 4.2421875, + "logps/chosen": -1146.0, + "logps/rejected": -884.0, + "loss": 0.4517, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.775390625, + "rewards/margins": 8.453125, + "rewards/rejected": -5.66796875, + "step": 2362 + }, + { + "epoch": 
0.4688724639119004, + "grad_norm": 30.37838290067318, + "learning_rate": 6.760726561797023e-07, + "logits/chosen": 4.6875, + "logits/rejected": 4.5546875, + "logps/chosen": -968.0, + "logps/rejected": -735.0, + "loss": 0.3885, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.576171875, + "rewards/margins": 7.06640625, + "rewards/rejected": -4.484375, + "step": 2363 + }, + { + "epoch": 0.469070886452701, + "grad_norm": 29.75723298250317, + "learning_rate": 6.757734413474267e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.03125, + "logps/chosen": -818.0, + "logps/rejected": -700.5, + "loss": 0.6464, + "rewards/accuracies": 0.65625, + "rewards/chosen": 1.69140625, + "rewards/margins": 3.673828125, + "rewards/rejected": -1.986328125, + "step": 2364 + }, + { + "epoch": 0.46926930899350167, + "grad_norm": 32.049909957876054, + "learning_rate": 6.754741661838656e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 3.98828125, + "logps/chosen": -780.0, + "logps/rejected": -700.0, + "loss": 0.3444, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.04296875, + "rewards/margins": 7.6484375, + "rewards/rejected": -5.59375, + "step": 2365 + }, + { + "epoch": 0.4694677315343023, + "grad_norm": 29.8183718161409, + "learning_rate": 6.751748308325758e-07, + "logits/chosen": 4.34765625, + "logits/rejected": 4.3828125, + "logps/chosen": -1043.0, + "logps/rejected": -699.0, + "loss": 0.462, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.33984375, + "rewards/margins": 6.62890625, + "rewards/rejected": -4.2890625, + "step": 2366 + }, + { + "epoch": 0.4696661540751029, + "grad_norm": 28.244277345802423, + "learning_rate": 6.74875435437143e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.515625, + "logps/chosen": -729.0, + "logps/rejected": -712.0, + "loss": 0.4864, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.55078125, + "rewards/margins": 6.5390625, + "rewards/rejected": -4.984375, + "step": 2367 + }, + { + "epoch": 0.4698645766159036, + 
"grad_norm": 31.930837048323163, + "learning_rate": 6.745759801411822e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.21484375, + "logps/chosen": -910.0, + "logps/rejected": -701.0, + "loss": 0.4264, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.14453125, + "rewards/margins": 6.14453125, + "rewards/rejected": -4.0078125, + "step": 2368 + }, + { + "epoch": 0.4700629991567042, + "grad_norm": 37.14174045074016, + "learning_rate": 6.742764650883366e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.91796875, + "logps/chosen": -1015.0, + "logps/rejected": -787.0, + "loss": 0.3969, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.310546875, + "rewards/margins": 7.1015625, + "rewards/rejected": -4.7890625, + "step": 2369 + }, + { + "epoch": 0.47026142169750484, + "grad_norm": 37.809812152235025, + "learning_rate": 6.739768904222782e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 4.2421875, + "logps/chosen": -1007.0, + "logps/rejected": -1061.0, + "loss": 0.4486, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4423828125, + "rewards/margins": 13.8515625, + "rewards/rejected": -12.4453125, + "step": 2370 + }, + { + "epoch": 0.47045984423830545, + "grad_norm": 29.182241382742966, + "learning_rate": 6.736772562867076e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.421875, + "logps/chosen": -845.0, + "logps/rejected": -641.5, + "loss": 0.4623, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.3125, + "rewards/margins": 6.7109375, + "rewards/rejected": -4.3984375, + "step": 2371 + }, + { + "epoch": 0.4706582667791061, + "grad_norm": 31.695442622708587, + "learning_rate": 6.733775628253542e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.18359375, + "logps/chosen": -881.0, + "logps/rejected": -577.0, + "loss": 0.4483, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.2421875, + "rewards/margins": 6.5703125, + "rewards/rejected": -4.3203125, + "step": 2372 + }, + { + "epoch": 0.47085668931990676, + 
"grad_norm": 33.82078773115428, + "learning_rate": 6.730778101819754e-07, + "logits/chosen": 4.09375, + "logits/rejected": 3.9609375, + "logps/chosen": -1369.0, + "logps/rejected": -853.0, + "loss": 0.5036, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.10546875, + "rewards/margins": 6.53125, + "rewards/rejected": -4.421875, + "step": 2373 + }, + { + "epoch": 0.47105511186070737, + "grad_norm": 36.805396477422235, + "learning_rate": 6.727779985003575e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 4.0859375, + "logps/chosen": -1061.0, + "logps/rejected": -838.0, + "loss": 0.5417, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.984375, + "rewards/margins": 5.6015625, + "rewards/rejected": -3.6015625, + "step": 2374 + }, + { + "epoch": 0.471253534401508, + "grad_norm": 36.675645986304204, + "learning_rate": 6.724781279243146e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.80859375, + "logps/chosen": -821.0, + "logps/rejected": -1662.0, + "loss": 0.5139, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.85546875, + "rewards/margins": 9.25390625, + "rewards/rejected": -7.38671875, + "step": 2375 + }, + { + "epoch": 0.4714519569423086, + "grad_norm": 32.64364083785157, + "learning_rate": 6.721781985976892e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.40234375, + "logps/chosen": -663.0, + "logps/rejected": -1197.5, + "loss": 0.6222, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.1962890625, + "rewards/margins": 7.90625, + "rewards/rejected": -6.703125, + "step": 2376 + }, + { + "epoch": 0.4716503794831093, + "grad_norm": 34.654478824588175, + "learning_rate": 6.718782106643523e-07, + "logits/chosen": 4.33984375, + "logits/rejected": 4.3984375, + "logps/chosen": -1226.0, + "logps/rejected": -959.0, + "loss": 0.3803, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.759765625, + "rewards/margins": 8.9375, + "rewards/rejected": -6.171875, + "step": 2377 + }, + { + "epoch": 0.47184880202390994, + "grad_norm": 
33.75207444245168, + "learning_rate": 6.715781642682025e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.31640625, + "logps/chosen": -1140.0, + "logps/rejected": -932.0, + "loss": 0.3862, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.15234375, + "rewards/margins": 9.3671875, + "rewards/rejected": -7.2109375, + "step": 2378 + }, + { + "epoch": 0.47204722456471054, + "grad_norm": 32.34032175304754, + "learning_rate": 6.712780595531673e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 3.63671875, + "logps/chosen": -880.0, + "logps/rejected": -871.0, + "loss": 0.533, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.9375, + "rewards/margins": 5.8359375, + "rewards/rejected": -3.888671875, + "step": 2379 + }, + { + "epoch": 0.4722456471055112, + "grad_norm": 32.13472146409526, + "learning_rate": 6.70977896663201e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.390625, + "logps/chosen": -1161.0, + "logps/rejected": -1339.0, + "loss": 0.4826, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.80859375, + "rewards/margins": 8.4453125, + "rewards/rejected": -5.640625, + "step": 2380 + }, + { + "epoch": 0.4724440696463118, + "grad_norm": 26.421051169741013, + "learning_rate": 6.706776757422868e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 3.984375, + "logps/chosen": -1161.0, + "logps/rejected": -656.0, + "loss": 0.5008, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.87890625, + "rewards/margins": 5.20703125, + "rewards/rejected": -4.3203125, + "step": 2381 + }, + { + "epoch": 0.47264249218711246, + "grad_norm": 24.978273905814536, + "learning_rate": 6.703773969344351e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 3.89453125, + "logps/chosen": -875.0, + "logps/rejected": -2016.0, + "loss": 0.3167, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.681640625, + "rewards/margins": 11.515625, + "rewards/rejected": -8.8515625, + "step": 2382 + }, + { + "epoch": 0.47284091472791306, + "grad_norm": 
33.85435832217097, + "learning_rate": 6.700770603836843e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 4.03515625, + "logps/chosen": -785.5, + "logps/rejected": -1147.5, + "loss": 0.3507, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.962890625, + "rewards/margins": 9.6796875, + "rewards/rejected": -7.7109375, + "step": 2383 + }, + { + "epoch": 0.4730393372687137, + "grad_norm": 30.846767262671783, + "learning_rate": 6.697766662341008e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 3.84375, + "logps/chosen": -1058.0, + "logps/rejected": -635.5, + "loss": 0.3957, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.451171875, + "rewards/margins": 5.79296875, + "rewards/rejected": -4.3515625, + "step": 2384 + }, + { + "epoch": 0.4732377598095144, + "grad_norm": 27.332844948493413, + "learning_rate": 6.69476214629778e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.03515625, + "logps/chosen": -945.0, + "logps/rejected": -742.0, + "loss": 0.4451, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.06640625, + "rewards/margins": 6.37109375, + "rewards/rejected": -4.30078125, + "step": 2385 + }, + { + "epoch": 0.473436182350315, + "grad_norm": 40.66943286396265, + "learning_rate": 6.691757057148372e-07, + "logits/chosen": 3.8515625, + "logits/rejected": 3.7578125, + "logps/chosen": -923.0, + "logps/rejected": -1958.0, + "loss": 0.348, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.873046875, + "rewards/margins": 11.1484375, + "rewards/rejected": -9.2890625, + "step": 2386 + }, + { + "epoch": 0.47363460489111564, + "grad_norm": 26.257888787529758, + "learning_rate": 6.688751396334274e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.09375, + "logps/chosen": -696.0, + "logps/rejected": -923.0, + "loss": 0.3357, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.35546875, + "rewards/margins": 7.71875, + "rewards/rejected": -5.34375, + "step": 2387 + }, + { + "epoch": 0.47383302743191624, + "grad_norm": 
37.9688939548456, + "learning_rate": 6.685745165297247e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 4.10546875, + "logps/chosen": -975.0, + "logps/rejected": -757.5, + "loss": 0.4365, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.24609375, + "rewards/margins": 7.0078125, + "rewards/rejected": -4.765625, + "step": 2388 + }, + { + "epoch": 0.4740314499727169, + "grad_norm": 30.41244920417217, + "learning_rate": 6.682738365479328e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 4.23828125, + "logps/chosen": -867.0, + "logps/rejected": -1533.0, + "loss": 0.4075, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.80859375, + "rewards/margins": 8.0234375, + "rewards/rejected": -6.22265625, + "step": 2389 + }, + { + "epoch": 0.47422987251351756, + "grad_norm": 29.816963245144315, + "learning_rate": 6.679730998322824e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.265625, + "logps/chosen": -838.5, + "logps/rejected": -665.5, + "loss": 0.3532, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.501953125, + "rewards/margins": 12.6640625, + "rewards/rejected": -10.1328125, + "step": 2390 + }, + { + "epoch": 0.47442829505431816, + "grad_norm": 33.87646674485644, + "learning_rate": 6.676723065270317e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.546875, + "logps/chosen": -798.0, + "logps/rejected": -613.5, + "loss": 0.5557, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.5625, + "rewards/margins": 4.96484375, + "rewards/rejected": -3.39453125, + "step": 2391 + }, + { + "epoch": 0.4746267175951188, + "grad_norm": 38.51482825826127, + "learning_rate": 6.673714567764657e-07, + "logits/chosen": 3.625, + "logits/rejected": 4.11328125, + "logps/chosen": -1203.0, + "logps/rejected": -918.0, + "loss": 0.4018, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.611328125, + "rewards/margins": 8.45703125, + "rewards/rejected": -5.83203125, + "step": 2392 + }, + { + "epoch": 0.4748251401359194, + "grad_norm": 36.62635793620562, 
+ "learning_rate": 6.670705507248969e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.1328125, + "logps/chosen": -955.0, + "logps/rejected": -723.0, + "loss": 0.4877, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.83203125, + "rewards/margins": 6.234375, + "rewards/rejected": -4.39453125, + "step": 2393 + }, + { + "epoch": 0.4750235626767201, + "grad_norm": 35.14517274407926, + "learning_rate": 6.667695885166642e-07, + "logits/chosen": 3.69140625, + "logits/rejected": 3.8515625, + "logps/chosen": -1379.0, + "logps/rejected": -982.0, + "loss": 0.3688, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.65234375, + "rewards/margins": 10.046875, + "rewards/rejected": -7.4140625, + "step": 2394 + }, + { + "epoch": 0.47522198521752074, + "grad_norm": 34.720901846907765, + "learning_rate": 6.664685702961344e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.85546875, + "logps/chosen": -783.0, + "logps/rejected": -722.0, + "loss": 0.4329, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.9296875, + "rewards/margins": 6.703125, + "rewards/rejected": -4.7734375, + "step": 2395 + }, + { + "epoch": 0.47542040775832134, + "grad_norm": 29.983449303742226, + "learning_rate": 6.661674962077002e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.81640625, + "logps/chosen": -990.0, + "logps/rejected": -841.5, + "loss": 0.3807, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0361328125, + "rewards/margins": 8.3046875, + "rewards/rejected": -6.26953125, + "step": 2396 + }, + { + "epoch": 0.475618830299122, + "grad_norm": 33.76307923349019, + "learning_rate": 6.658663663957815e-07, + "logits/chosen": 4.29296875, + "logits/rejected": 4.2578125, + "logps/chosen": -1933.0, + "logps/rejected": -814.0, + "loss": 0.345, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.294921875, + "rewards/margins": 7.984375, + "rewards/rejected": -7.6953125, + "step": 2397 + }, + { + "epoch": 0.4758172528399226, + "grad_norm": 38.24695795972548, + 
"learning_rate": 6.65565181004825e-07, + "logits/chosen": 3.921875, + "logits/rejected": 4.25, + "logps/chosen": -1029.0, + "logps/rejected": -856.5, + "loss": 0.4899, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.1640625, + "rewards/margins": 7.09375, + "rewards/rejected": -4.9296875, + "step": 2398 + }, + { + "epoch": 0.47601567538072326, + "grad_norm": 35.9272271196191, + "learning_rate": 6.65263940179304e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.08203125, + "logps/chosen": -1172.0, + "logps/rejected": -1907.0, + "loss": 0.3977, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.33203125, + "rewards/margins": 10.578125, + "rewards/rejected": -8.2421875, + "step": 2399 + }, + { + "epoch": 0.47621409792152386, + "grad_norm": 26.289758358244853, + "learning_rate": 6.649626440637185e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.3515625, + "logps/chosen": -863.0, + "logps/rejected": -659.0, + "loss": 0.3423, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.05859375, + "rewards/margins": 7.859375, + "rewards/rejected": -4.8125, + "step": 2400 + }, + { + "epoch": 0.4764125204623245, + "grad_norm": 31.757080244933793, + "learning_rate": 6.646612928025947e-07, + "logits/chosen": 3.6953125, + "logits/rejected": 3.91796875, + "logps/chosen": -962.0, + "logps/rejected": -685.5, + "loss": 0.3869, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3974609375, + "rewards/margins": 6.8671875, + "rewards/rejected": -4.4609375, + "step": 2401 + }, + { + "epoch": 0.4766109430031252, + "grad_norm": 31.289232621497387, + "learning_rate": 6.643598865404853e-07, + "logits/chosen": 4.09375, + "logits/rejected": 3.88671875, + "logps/chosen": -1297.0, + "logps/rejected": -897.0, + "loss": 0.3729, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.728515625, + "rewards/margins": 7.65625, + "rewards/rejected": -4.94921875, + "step": 2402 + }, + { + "epoch": 0.4768093655439258, + "grad_norm": 44.26528223190179, + "learning_rate": 
6.640584254219699e-07, + "logits/chosen": 3.7421875, + "logits/rejected": 3.890625, + "logps/chosen": -1098.0, + "logps/rejected": -751.0, + "loss": 0.4631, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.517578125, + "rewards/margins": 6.453125, + "rewards/rejected": -3.9296875, + "step": 2403 + }, + { + "epoch": 0.47700778808472644, + "grad_norm": 35.620174176702264, + "learning_rate": 6.637569095916538e-07, + "logits/chosen": 3.46875, + "logits/rejected": 3.453125, + "logps/chosen": -873.0, + "logps/rejected": -951.5, + "loss": 0.5304, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6015625, + "rewards/margins": 6.6328125, + "rewards/rejected": -5.05078125, + "step": 2404 + }, + { + "epoch": 0.47720621062552704, + "grad_norm": 31.16998904495971, + "learning_rate": 6.634553391941689e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.05859375, + "logps/chosen": -601.5, + "logps/rejected": -484.5, + "loss": 0.4569, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6884765625, + "rewards/margins": 4.87109375, + "rewards/rejected": -3.1875, + "step": 2405 + }, + { + "epoch": 0.4774046331663277, + "grad_norm": 25.65520984927983, + "learning_rate": 6.631537143741732e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.2265625, + "logps/chosen": -995.0, + "logps/rejected": -624.0, + "loss": 0.3557, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.671875, + "rewards/margins": 7.2265625, + "rewards/rejected": -4.56640625, + "step": 2406 + }, + { + "epoch": 0.47760305570712835, + "grad_norm": 28.406897906325888, + "learning_rate": 6.628520352763506e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.0390625, + "logps/chosen": -1065.0, + "logps/rejected": -675.0, + "loss": 0.4353, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.400390625, + "rewards/margins": 7.2890625, + "rewards/rejected": -4.88671875, + "step": 2407 + }, + { + "epoch": 0.47780147824792896, + "grad_norm": 39.94902541987662, + "learning_rate": 6.625503020454114e-07, + 
"logits/chosen": 3.984375, + "logits/rejected": 4.21875, + "logps/chosen": -920.0, + "logps/rejected": -719.0, + "loss": 0.5522, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.015625, + "rewards/margins": 5.13671875, + "rewards/rejected": -3.1171875, + "step": 2408 + }, + { + "epoch": 0.4779999007887296, + "grad_norm": 34.43053827199236, + "learning_rate": 6.622485148260915e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 3.76171875, + "logps/chosen": -1179.0, + "logps/rejected": -602.0, + "loss": 0.3956, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.283203125, + "rewards/margins": 6.6796875, + "rewards/rejected": -4.40234375, + "step": 2409 + }, + { + "epoch": 0.4781983233295302, + "grad_norm": 29.37388305593983, + "learning_rate": 6.61946673763153e-07, + "logits/chosen": 4.515625, + "logits/rejected": 4.3828125, + "logps/chosen": -1090.0, + "logps/rejected": -809.0, + "loss": 0.3765, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4609375, + "rewards/margins": 13.4609375, + "rewards/rejected": -10.98046875, + "step": 2410 + }, + { + "epoch": 0.4783967458703309, + "grad_norm": 30.309938406599866, + "learning_rate": 6.616447790013836e-07, + "logits/chosen": 4.640625, + "logits/rejected": 4.3828125, + "logps/chosen": -1182.0, + "logps/rejected": -737.5, + "loss": 0.4279, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8818359375, + "rewards/margins": 7.4609375, + "rewards/rejected": -4.568359375, + "step": 2411 + }, + { + "epoch": 0.4785951684111315, + "grad_norm": 30.237943367920682, + "learning_rate": 6.613428306855969e-07, + "logits/chosen": 4.359375, + "logits/rejected": 4.3671875, + "logps/chosen": -1076.0, + "logps/rejected": -955.0, + "loss": 0.4866, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.3173828125, + "rewards/margins": 15.6796875, + "rewards/rejected": -13.390625, + "step": 2412 + }, + { + "epoch": 0.47879359095193214, + "grad_norm": 27.886496432274008, + "learning_rate": 6.610408289606321e-07, + 
"logits/chosen": 4.06640625, + "logits/rejected": 3.9921875, + "logps/chosen": -1160.0, + "logps/rejected": -717.0, + "loss": 0.4446, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.40234375, + "rewards/margins": 7.21875, + "rewards/rejected": -5.828125, + "step": 2413 + }, + { + "epoch": 0.4789920134927328, + "grad_norm": 38.46981048143589, + "learning_rate": 6.607387739713541e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.85546875, + "logps/chosen": -1333.0, + "logps/rejected": -864.0, + "loss": 0.4339, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.134765625, + "rewards/margins": 7.46875, + "rewards/rejected": -5.33984375, + "step": 2414 + }, + { + "epoch": 0.4791904360335334, + "grad_norm": 33.27138690658631, + "learning_rate": 6.604366658626534e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.5078125, + "logps/chosen": -887.0, + "logps/rejected": -754.5, + "loss": 0.3979, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.90234375, + "rewards/margins": 7.9765625, + "rewards/rejected": -5.06640625, + "step": 2415 + }, + { + "epoch": 0.47938885857433405, + "grad_norm": 38.972112662977786, + "learning_rate": 6.601345047794457e-07, + "logits/chosen": 4.37109375, + "logits/rejected": 4.25, + "logps/chosen": -1006.0, + "logps/rejected": -910.0, + "loss": 0.4738, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.001953125, + "rewards/margins": 6.89453125, + "rewards/rejected": -4.88671875, + "step": 2416 + }, + { + "epoch": 0.47958728111513466, + "grad_norm": 35.0545357593261, + "learning_rate": 6.598322908666725e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 4.24609375, + "logps/chosen": -1008.0, + "logps/rejected": -713.0, + "loss": 0.3756, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.271484375, + "rewards/margins": 8.1328125, + "rewards/rejected": -5.859375, + "step": 2417 + }, + { + "epoch": 0.4797857036559353, + "grad_norm": 31.086016864506504, + "learning_rate": 6.595300242693003e-07, + 
"logits/chosen": 4.1796875, + "logits/rejected": 4.09765625, + "logps/chosen": -1049.0, + "logps/rejected": -890.0, + "loss": 0.4168, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.08984375, + "rewards/margins": 8.625, + "rewards/rejected": -6.546875, + "step": 2418 + }, + { + "epoch": 0.47998412619673597, + "grad_norm": 33.831567621375704, + "learning_rate": 6.592277051323209e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.13671875, + "logps/chosen": -1027.0, + "logps/rejected": -721.5, + "loss": 0.3586, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.294921875, + "rewards/margins": 8.0546875, + "rewards/rejected": -5.75, + "step": 2419 + }, + { + "epoch": 0.4801825487375366, + "grad_norm": 35.52689683849122, + "learning_rate": 6.589253336007517e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.953125, + "logps/chosen": -1441.0, + "logps/rejected": -867.5, + "loss": 0.3695, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0927734375, + "rewards/margins": 6.73828125, + "rewards/rejected": -5.65625, + "step": 2420 + }, + { + "epoch": 0.48038097127833723, + "grad_norm": 31.69192473277105, + "learning_rate": 6.586229098196349e-07, + "logits/chosen": 4.6484375, + "logits/rejected": 4.34765625, + "logps/chosen": -1327.0, + "logps/rejected": -757.5, + "loss": 0.3371, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.033203125, + "rewards/margins": 8.09375, + "rewards/rejected": -5.06640625, + "step": 2421 + }, + { + "epoch": 0.48057939381913783, + "grad_norm": 33.56691205770096, + "learning_rate": 6.583204339340379e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 4.1171875, + "logps/chosen": -796.5, + "logps/rejected": -581.0, + "loss": 0.4425, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.927734375, + "rewards/margins": 6.4140625, + "rewards/rejected": -4.48046875, + "step": 2422 + }, + { + "epoch": 0.4807778163599385, + "grad_norm": 44.73703636337632, + "learning_rate": 6.580179060890526e-07, + "logits/chosen": 
4.390625, + "logits/rejected": 4.328125, + "logps/chosen": -1152.0, + "logps/rejected": -678.75, + "loss": 0.4171, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.666015625, + "rewards/margins": 7.7421875, + "rewards/rejected": -5.068359375, + "step": 2423 + }, + { + "epoch": 0.48097623890073915, + "grad_norm": 39.137730354847285, + "learning_rate": 6.577153264297967e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 4.34765625, + "logps/chosen": -1099.0, + "logps/rejected": -647.0, + "loss": 0.4025, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1640625, + "rewards/margins": 6.15234375, + "rewards/rejected": -4.0029296875, + "step": 2424 + }, + { + "epoch": 0.48117466144153975, + "grad_norm": 31.308702141532954, + "learning_rate": 6.57412695101412e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 3.4296875, + "logps/chosen": -650.0, + "logps/rejected": -786.0, + "loss": 0.5096, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.892578125, + "rewards/margins": 5.87109375, + "rewards/rejected": -3.96923828125, + "step": 2425 + }, + { + "epoch": 0.4813730839823404, + "grad_norm": 32.246125451620024, + "learning_rate": 6.571100122490654e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.7890625, + "logps/chosen": -927.0, + "logps/rejected": -1468.0, + "loss": 0.4286, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.990234375, + "rewards/margins": 9.109375, + "rewards/rejected": -7.09765625, + "step": 2426 + }, + { + "epoch": 0.481571506523141, + "grad_norm": 34.20900650831018, + "learning_rate": 6.568072780179486e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.45703125, + "logps/chosen": -1117.0, + "logps/rejected": -943.0, + "loss": 0.4539, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.234375, + "rewards/margins": 9.7421875, + "rewards/rejected": -6.5, + "step": 2427 + }, + { + "epoch": 0.48176992906394167, + "grad_norm": 33.5335311428675, + "learning_rate": 6.565044925532778e-07, + "logits/chosen": 3.921875, 
+ "logits/rejected": 3.75390625, + "logps/chosen": -683.5, + "logps/rejected": -653.0, + "loss": 0.4362, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.869140625, + "rewards/margins": 17.55078125, + "rewards/rejected": -15.6953125, + "step": 2428 + }, + { + "epoch": 0.4819683516047423, + "grad_norm": 27.538681879851687, + "learning_rate": 6.562016560002937e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.1484375, + "logps/chosen": -784.0, + "logps/rejected": -553.0, + "loss": 0.3738, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.59326171875, + "rewards/margins": 7.08203125, + "rewards/rejected": -5.484375, + "step": 2429 + }, + { + "epoch": 0.48216677414554293, + "grad_norm": 41.24676805805146, + "learning_rate": 6.558987685042619e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.25, + "logps/chosen": -778.0, + "logps/rejected": -542.5, + "loss": 0.5468, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8388671875, + "rewards/margins": 4.0234375, + "rewards/rejected": -2.1875, + "step": 2430 + }, + { + "epoch": 0.4823651966863436, + "grad_norm": 33.04431213579719, + "learning_rate": 6.555958302104718e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.96484375, + "logps/chosen": -1004.0, + "logps/rejected": -700.0, + "loss": 0.3991, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.181640625, + "rewards/margins": 6.3046875, + "rewards/rejected": -4.12890625, + "step": 2431 + }, + { + "epoch": 0.4825636192271442, + "grad_norm": 36.39423059336554, + "learning_rate": 6.552928412642378e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 4.31640625, + "logps/chosen": -807.0, + "logps/rejected": -1016.0, + "loss": 0.3497, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.265625, + "rewards/margins": 7.6796875, + "rewards/rejected": -5.40625, + "step": 2432 + }, + { + "epoch": 0.48276204176794485, + "grad_norm": 31.852780592066903, + "learning_rate": 6.549898018108981e-07, + "logits/chosen": 4.640625, + 
"logits/rejected": 4.4296875, + "logps/chosen": -1301.0, + "logps/rejected": -684.0, + "loss": 0.4104, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.06640625, + "rewards/margins": 8.40625, + "rewards/rejected": -5.33203125, + "step": 2433 + }, + { + "epoch": 0.48296046430874545, + "grad_norm": 31.907761167779967, + "learning_rate": 6.546867119958154e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.81640625, + "logps/chosen": -816.0, + "logps/rejected": -1038.0, + "loss": 0.4389, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.736328125, + "rewards/margins": 7.44921875, + "rewards/rejected": -5.703125, + "step": 2434 + }, + { + "epoch": 0.4831588868495461, + "grad_norm": 26.41456218556389, + "learning_rate": 6.543835719643767e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 3.56640625, + "logps/chosen": -739.0, + "logps/rejected": -558.0, + "loss": 0.4861, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.509765625, + "rewards/margins": 6.24609375, + "rewards/rejected": -4.7421875, + "step": 2435 + }, + { + "epoch": 0.48335730939034677, + "grad_norm": 34.606677229195746, + "learning_rate": 6.540803818619924e-07, + "logits/chosen": 3.59375, + "logits/rejected": 3.265625, + "logps/chosen": -970.0, + "logps/rejected": -902.5, + "loss": 0.3828, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.115234375, + "rewards/margins": 8.296875, + "rewards/rejected": -6.171875, + "step": 2436 + }, + { + "epoch": 0.48355573193114737, + "grad_norm": 31.375658673798014, + "learning_rate": 6.537771418340981e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.06640625, + "logps/chosen": -813.0, + "logps/rejected": -518.0, + "loss": 0.4989, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.541015625, + "rewards/margins": 6.65625, + "rewards/rejected": -4.1171875, + "step": 2437 + }, + { + "epoch": 0.48375415447194803, + "grad_norm": 35.30539370297356, + "learning_rate": 6.534738520261521e-07, + "logits/chosen": 4.04296875, + 
"logits/rejected": 4.15625, + "logps/chosen": -907.0, + "logps/rejected": -1000.0, + "loss": 0.4161, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.94140625, + "rewards/margins": 8.671875, + "rewards/rejected": -6.7265625, + "step": 2438 + }, + { + "epoch": 0.48395257701274863, + "grad_norm": 40.37080626766238, + "learning_rate": 6.531705125836373e-07, + "logits/chosen": 3.6484375, + "logits/rejected": 3.85546875, + "logps/chosen": -1131.0, + "logps/rejected": -759.0, + "loss": 0.2792, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.30078125, + "rewards/margins": 9.2421875, + "rewards/rejected": -6.9375, + "step": 2439 + }, + { + "epoch": 0.4841509995535493, + "grad_norm": 36.77103709101452, + "learning_rate": 6.528671236520604e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.66015625, + "logps/chosen": -1095.0, + "logps/rejected": -975.0, + "loss": 0.3775, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.7841796875, + "rewards/margins": 8.4453125, + "rewards/rejected": -6.6796875, + "step": 2440 + }, + { + "epoch": 0.4843494220943499, + "grad_norm": 32.646568237519475, + "learning_rate": 6.525636853769514e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.80078125, + "logps/chosen": -1059.0, + "logps/rejected": -897.0, + "loss": 0.4677, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.703125, + "rewards/margins": 7.703125, + "rewards/rejected": -6.00390625, + "step": 2441 + }, + { + "epoch": 0.48454784463515055, + "grad_norm": 34.98316686238892, + "learning_rate": 6.522601979038643e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 3.74609375, + "logps/chosen": -640.5, + "logps/rejected": -500.0, + "loss": 0.5105, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.7889404296875, + "rewards/margins": 5.33203125, + "rewards/rejected": -3.5546875, + "step": 2442 + }, + { + "epoch": 0.4847462671759512, + "grad_norm": 38.994466572250644, + "learning_rate": 6.51956661378377e-07, + "logits/chosen": 3.55859375, + 
"logits/rejected": 3.65625, + "logps/chosen": -989.0, + "logps/rejected": -653.0, + "loss": 0.5108, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.0615234375, + "rewards/margins": 6.01953125, + "rewards/rejected": -3.97265625, + "step": 2443 + }, + { + "epoch": 0.4849446897167518, + "grad_norm": 26.91384692940704, + "learning_rate": 6.516530759460901e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 3.96875, + "logps/chosen": -1392.0, + "logps/rejected": -819.0, + "loss": 0.3355, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.97265625, + "rewards/margins": 8.578125, + "rewards/rejected": -5.6015625, + "step": 2444 + }, + { + "epoch": 0.48514311225755247, + "grad_norm": 27.882547170525832, + "learning_rate": 6.513494417526284e-07, + "logits/chosen": 4.6875, + "logits/rejected": 4.4140625, + "logps/chosen": -1249.5, + "logps/rejected": -685.0, + "loss": 0.3928, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.607421875, + "rewards/margins": 6.57421875, + "rewards/rejected": -3.96875, + "step": 2445 + }, + { + "epoch": 0.48534153479835307, + "grad_norm": 26.855261037205437, + "learning_rate": 6.5104575894364e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 3.6328125, + "logps/chosen": -771.0, + "logps/rejected": -598.0, + "loss": 0.4576, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.10546875, + "rewards/margins": 6.515625, + "rewards/rejected": -4.4140625, + "step": 2446 + }, + { + "epoch": 0.4855399573391537, + "grad_norm": 32.47418219609228, + "learning_rate": 6.507420276647957e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.296875, + "logps/chosen": -1299.0, + "logps/rejected": -1650.0, + "loss": 0.3398, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6640625, + "rewards/margins": 10.921875, + "rewards/rejected": -8.2578125, + "step": 2447 + }, + { + "epoch": 0.4857383798799544, + "grad_norm": 37.99042681298759, + "learning_rate": 6.504382480617904e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 
4.265625, + "logps/chosen": -1018.0, + "logps/rejected": -1150.0, + "loss": 0.4033, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.68359375, + "rewards/margins": 18.8359375, + "rewards/rejected": -16.125, + "step": 2448 + }, + { + "epoch": 0.485936802420755, + "grad_norm": 33.76011599318253, + "learning_rate": 6.501344202803414e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 4.1015625, + "logps/chosen": -689.5, + "logps/rejected": -620.5, + "loss": 0.5498, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.84228515625, + "rewards/margins": 6.546875, + "rewards/rejected": -5.703125, + "step": 2449 + }, + { + "epoch": 0.48613522496155565, + "grad_norm": 35.98450338788158, + "learning_rate": 6.498305444661898e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.0703125, + "logps/chosen": -1149.0, + "logps/rejected": -868.0, + "loss": 0.3967, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.33203125, + "rewards/margins": 7.0859375, + "rewards/rejected": -4.765625, + "step": 2450 + }, + { + "epoch": 0.48633364750235625, + "grad_norm": 35.07600388641933, + "learning_rate": 6.495266207650994e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.74609375, + "logps/chosen": -698.0, + "logps/rejected": -645.0, + "loss": 0.3349, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.841796875, + "rewards/margins": 8.015625, + "rewards/rejected": -6.1640625, + "step": 2451 + }, + { + "epoch": 0.4865320700431569, + "grad_norm": 29.559868776995266, + "learning_rate": 6.492226493228569e-07, + "logits/chosen": 3.890625, + "logits/rejected": 4.0390625, + "logps/chosen": -977.0, + "logps/rejected": -867.0, + "loss": 0.3932, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8447265625, + "rewards/margins": 8.53125, + "rewards/rejected": -6.7109375, + "step": 2452 + }, + { + "epoch": 0.48673049258395756, + "grad_norm": 36.144554001480735, + "learning_rate": 6.489186302852722e-07, + "logits/chosen": 3.60546875, + "logits/rejected": 3.69921875, + 
"logps/chosen": -728.0, + "logps/rejected": -648.0, + "loss": 0.4421, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.146484375, + "rewards/margins": 5.5625, + "rewards/rejected": -3.408203125, + "step": 2453 + }, + { + "epoch": 0.48692891512475817, + "grad_norm": 46.40676596618364, + "learning_rate": 6.486145637981775e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.84375, + "logps/chosen": -998.0, + "logps/rejected": -864.0, + "loss": 0.3977, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.787109375, + "rewards/margins": 7.890625, + "rewards/rejected": -6.10546875, + "step": 2454 + }, + { + "epoch": 0.4871273376655588, + "grad_norm": 34.41775465223971, + "learning_rate": 6.483104500074283e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 3.73046875, + "logps/chosen": -1084.0, + "logps/rejected": -750.0, + "loss": 0.3984, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7265625, + "rewards/margins": 7.125, + "rewards/rejected": -4.4140625, + "step": 2455 + }, + { + "epoch": 0.4873257602063594, + "grad_norm": 27.205611591161865, + "learning_rate": 6.480062890589025e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.32421875, + "logps/chosen": -769.5, + "logps/rejected": -1610.0, + "loss": 0.3654, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.02734375, + "rewards/margins": 11.1640625, + "rewards/rejected": -8.1484375, + "step": 2456 + }, + { + "epoch": 0.4875241827471601, + "grad_norm": 37.16930658877655, + "learning_rate": 6.477020810985009e-07, + "logits/chosen": 4.28515625, + "logits/rejected": 4.4296875, + "logps/chosen": -1039.0, + "logps/rejected": -1175.5, + "loss": 0.4386, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.076171875, + "rewards/margins": 7.1171875, + "rewards/rejected": -5.0546875, + "step": 2457 + }, + { + "epoch": 0.4877226052879607, + "grad_norm": 33.40775557865304, + "learning_rate": 6.473978262721463e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 3.98046875, + "logps/chosen": 
-1258.0, + "logps/rejected": -1684.0, + "loss": 0.2946, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.1708984375, + "rewards/margins": 10.828125, + "rewards/rejected": -8.65234375, + "step": 2458 + }, + { + "epoch": 0.48792102782876134, + "grad_norm": 30.462350002899502, + "learning_rate": 6.470935247257848e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.2890625, + "logps/chosen": -646.0, + "logps/rejected": -657.0, + "loss": 0.4261, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.12994384765625, + "rewards/margins": 6.5859375, + "rewards/rejected": -4.4609375, + "step": 2459 + }, + { + "epoch": 0.488119450369562, + "grad_norm": 32.62696184701605, + "learning_rate": 6.467891766053838e-07, + "logits/chosen": 3.54296875, + "logits/rejected": 3.5078125, + "logps/chosen": -1159.0, + "logps/rejected": -1514.0, + "loss": 0.381, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.55859375, + "rewards/margins": 9.02734375, + "rewards/rejected": -6.46484375, + "step": 2460 + }, + { + "epoch": 0.4883178729103626, + "grad_norm": 28.081760747975327, + "learning_rate": 6.464847820569343e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.6484375, + "logps/chosen": -800.0, + "logps/rejected": -1498.5, + "loss": 0.3935, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.251953125, + "rewards/margins": 9.25, + "rewards/rejected": -7.0078125, + "step": 2461 + }, + { + "epoch": 0.48851629545116326, + "grad_norm": 33.00748649881671, + "learning_rate": 6.461803412264488e-07, + "logits/chosen": 3.73828125, + "logits/rejected": 3.7578125, + "logps/chosen": -718.5, + "logps/rejected": -553.5, + "loss": 0.5094, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.3408203125, + "rewards/margins": 5.564453125, + "rewards/rejected": -4.240234375, + "step": 2462 + }, + { + "epoch": 0.48871471799196387, + "grad_norm": 32.99212514648311, + "learning_rate": 6.45875854259962e-07, + "logits/chosen": 3.69921875, + "logits/rejected": 3.73828125, + "logps/chosen": 
-1118.0, + "logps/rejected": -772.0, + "loss": 0.4985, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.2861328125, + "rewards/margins": 5.828125, + "rewards/rejected": -3.546875, + "step": 2463 + }, + { + "epoch": 0.4889131405327645, + "grad_norm": 43.70329991115316, + "learning_rate": 6.45571321303531e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.1015625, + "logps/chosen": -721.0, + "logps/rejected": -762.5, + "loss": 0.592, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7578125, + "rewards/margins": 5.46484375, + "rewards/rejected": -4.7109375, + "step": 2464 + }, + { + "epoch": 0.4891115630735652, + "grad_norm": 25.31482287327468, + "learning_rate": 6.452667425032349e-07, + "logits/chosen": 4.22265625, + "logits/rejected": 4.09375, + "logps/chosen": -999.0, + "logps/rejected": -712.0, + "loss": 0.3843, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.54541015625, + "rewards/margins": 6.7734375, + "rewards/rejected": -4.21875, + "step": 2465 + }, + { + "epoch": 0.4893099856143658, + "grad_norm": 26.91426251686392, + "learning_rate": 6.449621180051745e-07, + "logits/chosen": 3.67578125, + "logits/rejected": 3.6875, + "logps/chosen": -925.0, + "logps/rejected": -866.0, + "loss": 0.3036, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.51171875, + "rewards/margins": 9.390625, + "rewards/rejected": -6.875, + "step": 2466 + }, + { + "epoch": 0.48950840815516644, + "grad_norm": 30.015384239412477, + "learning_rate": 6.44657447955473e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 4.02734375, + "logps/chosen": -1213.0, + "logps/rejected": -1703.5, + "loss": 0.3296, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6796875, + "rewards/margins": 11.15625, + "rewards/rejected": -8.4921875, + "step": 2467 + }, + { + "epoch": 0.48970683069596704, + "grad_norm": 35.961540107660745, + "learning_rate": 6.443527325002752e-07, + "logits/chosen": 3.58203125, + "logits/rejected": 4.125, + "logps/chosen": -1015.0, + "logps/rejected": 
-774.0, + "loss": 0.4975, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.19140625, + "rewards/margins": 8.25, + "rewards/rejected": -6.05859375, + "step": 2468 + }, + { + "epoch": 0.4899052532367677, + "grad_norm": 38.7007687526521, + "learning_rate": 6.440479717857476e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 3.97265625, + "logps/chosen": -834.0, + "logps/rejected": -631.0, + "loss": 0.4551, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.3720703125, + "rewards/margins": 7.015625, + "rewards/rejected": -5.64453125, + "step": 2469 + }, + { + "epoch": 0.49010367577756836, + "grad_norm": 32.40801710885238, + "learning_rate": 6.437431659580785e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.015625, + "logps/chosen": -747.5, + "logps/rejected": -566.0, + "loss": 0.4724, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.3388671875, + "rewards/margins": 5.26953125, + "rewards/rejected": -3.92578125, + "step": 2470 + }, + { + "epoch": 0.49030209831836896, + "grad_norm": 32.7014761681757, + "learning_rate": 6.43438315163478e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.06640625, + "logps/chosen": -901.0, + "logps/rejected": -670.0, + "loss": 0.4107, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.6962890625, + "rewards/margins": 6.73046875, + "rewards/rejected": -5.03125, + "step": 2471 + }, + { + "epoch": 0.4905005208591696, + "grad_norm": 28.1408402082797, + "learning_rate": 6.431334195481774e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 3.8125, + "logps/chosen": -1408.0, + "logps/rejected": -776.5, + "loss": 0.471, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.271484375, + "rewards/margins": 7.7265625, + "rewards/rejected": -5.45703125, + "step": 2472 + }, + { + "epoch": 0.4906989433999702, + "grad_norm": 30.15622883181124, + "learning_rate": 6.428284792584299e-07, + "logits/chosen": 3.84375, + "logits/rejected": 3.828125, + "logps/chosen": -1019.0, + "logps/rejected": -874.0, + "loss": 0.3555, 
+ "rewards/accuracies": 0.875, + "rewards/chosen": 3.033203125, + "rewards/margins": 7.921875, + "rewards/rejected": -4.884765625, + "step": 2473 + }, + { + "epoch": 0.4908973659407709, + "grad_norm": 24.965075507813115, + "learning_rate": 6.425234944405095e-07, + "logits/chosen": 3.609375, + "logits/rejected": 3.75390625, + "logps/chosen": -984.5, + "logps/rejected": -666.0, + "loss": 0.4092, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.09765625, + "rewards/margins": 7.4296875, + "rewards/rejected": -5.32421875, + "step": 2474 + }, + { + "epoch": 0.4910957884815715, + "grad_norm": 29.139801784842632, + "learning_rate": 6.422184652407126e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.12890625, + "logps/chosen": -1013.0, + "logps/rejected": -726.0, + "loss": 0.4055, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.44140625, + "rewards/margins": 6.6484375, + "rewards/rejected": -4.1953125, + "step": 2475 + }, + { + "epoch": 0.49129421102237214, + "grad_norm": 35.58957539152166, + "learning_rate": 6.419133918053562e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.1328125, + "logps/chosen": -1371.0, + "logps/rejected": -926.0, + "loss": 0.3223, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.21484375, + "rewards/margins": 8.5703125, + "rewards/rejected": -5.3671875, + "step": 2476 + }, + { + "epoch": 0.4914926335631728, + "grad_norm": 32.16866805213591, + "learning_rate": 6.416082742807782e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.84765625, + "logps/chosen": -872.0, + "logps/rejected": -653.0, + "loss": 0.4057, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.91796875, + "rewards/margins": 7.6796875, + "rewards/rejected": -5.7734375, + "step": 2477 + }, + { + "epoch": 0.4916910561039734, + "grad_norm": 26.903715053446458, + "learning_rate": 6.413031128133384e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 4.13671875, + "logps/chosen": -835.0, + "logps/rejected": -785.0, + "loss": 0.3242, + 
"rewards/accuracies": 0.90625, + "rewards/chosen": 2.716796875, + "rewards/margins": 8.4375, + "rewards/rejected": -5.7109375, + "step": 2478 + }, + { + "epoch": 0.49188947864477406, + "grad_norm": 38.24877505305407, + "learning_rate": 6.40997907549417e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.17578125, + "logps/chosen": -892.5, + "logps/rejected": -602.5, + "loss": 0.4557, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3984375, + "rewards/margins": 6.65625, + "rewards/rejected": -4.2421875, + "step": 2479 + }, + { + "epoch": 0.49208790118557466, + "grad_norm": 35.28393088550019, + "learning_rate": 6.40692658635416e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.28515625, + "logps/chosen": -1098.0, + "logps/rejected": -752.0, + "loss": 0.2445, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.53515625, + "rewards/margins": 8.5078125, + "rewards/rejected": -5.96875, + "step": 2480 + }, + { + "epoch": 0.4922863237263753, + "grad_norm": 35.55907272822122, + "learning_rate": 6.403873662177576e-07, + "logits/chosen": 4.546875, + "logits/rejected": 4.24609375, + "logps/chosen": -1288.0, + "logps/rejected": -568.0, + "loss": 0.4513, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7724609375, + "rewards/margins": 5.171875, + "rewards/rejected": -4.3984375, + "step": 2481 + }, + { + "epoch": 0.492484746267176, + "grad_norm": 42.895699962288994, + "learning_rate": 6.400820304428851e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.32421875, + "logps/chosen": -923.0, + "logps/rejected": -737.5, + "loss": 0.5429, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.642578125, + "rewards/margins": 5.9921875, + "rewards/rejected": -4.359375, + "step": 2482 + }, + { + "epoch": 0.4926831688079766, + "grad_norm": 27.38287410433363, + "learning_rate": 6.397766514572628e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 3.75390625, + "logps/chosen": -833.0, + "logps/rejected": -648.0, + "loss": 0.4838, + "rewards/accuracies": 
0.8125, + "rewards/chosen": 1.853515625, + "rewards/margins": 6.375, + "rewards/rejected": -4.509765625, + "step": 2483 + }, + { + "epoch": 0.49288159134877724, + "grad_norm": 38.597753553978905, + "learning_rate": 6.394712294073756e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 3.98828125, + "logps/chosen": -1742.0, + "logps/rejected": -827.0, + "loss": 0.4044, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15625, + "rewards/margins": 5.81640625, + "rewards/rejected": -5.6484375, + "step": 2484 + }, + { + "epoch": 0.49308001388957784, + "grad_norm": 35.78978446895109, + "learning_rate": 6.39165764439729e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.74609375, + "logps/chosen": -1268.0, + "logps/rejected": -903.0, + "loss": 0.4495, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.470703125, + "rewards/margins": 7.4375, + "rewards/rejected": -4.97265625, + "step": 2485 + }, + { + "epoch": 0.4932784364303785, + "grad_norm": 31.4863485345061, + "learning_rate": 6.388602567008489e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.984375, + "logps/chosen": -733.0, + "logps/rejected": -1169.0, + "loss": 0.3885, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.302734375, + "rewards/margins": 8.6328125, + "rewards/rejected": -6.3359375, + "step": 2486 + }, + { + "epoch": 0.4934768589711791, + "grad_norm": 31.96009786586912, + "learning_rate": 6.385547063372823e-07, + "logits/chosen": 3.6328125, + "logits/rejected": 3.73046875, + "logps/chosen": -954.0, + "logps/rejected": -1103.0, + "loss": 0.4799, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.5693359375, + "rewards/margins": 6.71875, + "rewards/rejected": -5.142578125, + "step": 2487 + }, + { + "epoch": 0.49367528151197976, + "grad_norm": 32.195078629090915, + "learning_rate": 6.382491134955959e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 4.04296875, + "logps/chosen": -837.0, + "logps/rejected": -844.5, + "loss": 0.4596, + "rewards/accuracies": 0.84375, + 
"rewards/chosen": 2.5146484375, + "rewards/margins": 7.4609375, + "rewards/rejected": -4.94921875, + "step": 2488 + }, + { + "epoch": 0.4938737040527804, + "grad_norm": 40.589211150231975, + "learning_rate": 6.379434783223775e-07, + "logits/chosen": 3.578125, + "logits/rejected": 3.625, + "logps/chosen": -1069.0, + "logps/rejected": -786.0, + "loss": 0.3441, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.40234375, + "rewards/margins": 7.609375, + "rewards/rejected": -5.2109375, + "step": 2489 + }, + { + "epoch": 0.494072126593581, + "grad_norm": 36.515673868894815, + "learning_rate": 6.376378009642344e-07, + "logits/chosen": 4.046875, + "logits/rejected": 3.9375, + "logps/chosen": -1111.0, + "logps/rejected": -642.0, + "loss": 0.3863, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.58203125, + "rewards/margins": 7.6328125, + "rewards/rejected": -5.0546875, + "step": 2490 + }, + { + "epoch": 0.4942705491343817, + "grad_norm": 25.15899951771285, + "learning_rate": 6.373320815677951e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.8046875, + "logps/chosen": -836.0, + "logps/rejected": -494.5, + "loss": 0.3648, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.490234375, + "rewards/margins": 6.1484375, + "rewards/rejected": -3.65234375, + "step": 2491 + }, + { + "epoch": 0.4944689716751823, + "grad_norm": 31.42316370677527, + "learning_rate": 6.370263202797078e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.421875, + "logps/chosen": -980.0, + "logps/rejected": -727.0, + "loss": 0.4722, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.896484375, + "rewards/margins": 6.125, + "rewards/rejected": -4.234375, + "step": 2492 + }, + { + "epoch": 0.49466739421598294, + "grad_norm": 63.093981191661626, + "learning_rate": 6.367205172466403e-07, + "logits/chosen": 4.578125, + "logits/rejected": 4.59375, + "logps/chosen": -919.0, + "logps/rejected": -1371.5, + "loss": 0.5477, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.52734375, + 
"rewards/margins": 7.2421875, + "rewards/rejected": -5.7265625, + "step": 2493 + }, + { + "epoch": 0.4948658167567836, + "grad_norm": 32.95265245921425, + "learning_rate": 6.364146726152813e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 3.68359375, + "logps/chosen": -961.0, + "logps/rejected": -595.5, + "loss": 0.4448, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.185546875, + "rewards/margins": 6.390625, + "rewards/rejected": -4.201171875, + "step": 2494 + }, + { + "epoch": 0.4950642392975842, + "grad_norm": 29.125358814923594, + "learning_rate": 6.361087865323385e-07, + "logits/chosen": 3.5, + "logits/rejected": 3.671875, + "logps/chosen": -1032.0, + "logps/rejected": -691.0, + "loss": 0.3694, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.751953125, + "rewards/margins": 8.28125, + "rewards/rejected": -5.52734375, + "step": 2495 + }, + { + "epoch": 0.49526266183838485, + "grad_norm": 34.14679053759402, + "learning_rate": 6.358028591445407e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.16015625, + "logps/chosen": -715.5, + "logps/rejected": -615.5, + "loss": 0.385, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.8515625, + "rewards/margins": 7.2265625, + "rewards/rejected": -5.390625, + "step": 2496 + }, + { + "epoch": 0.49546108437918546, + "grad_norm": 37.03709102438802, + "learning_rate": 6.354968905986355e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.046875, + "logps/chosen": -1034.0, + "logps/rejected": -706.0, + "loss": 0.3465, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.93359375, + "rewards/margins": 9.1328125, + "rewards/rejected": -6.1875, + "step": 2497 + }, + { + "epoch": 0.4956595069199861, + "grad_norm": 34.84957081543594, + "learning_rate": 6.351908810413904e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.25, + "logps/chosen": -1239.0, + "logps/rejected": -800.5, + "loss": 0.3048, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.001953125, + "rewards/margins": 
8.7578125, + "rewards/rejected": -5.7578125, + "step": 2498 + }, + { + "epoch": 0.4958579294607868, + "grad_norm": 30.01060019824449, + "learning_rate": 6.34884830619593e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.38671875, + "logps/chosen": -1093.0, + "logps/rejected": -1247.0, + "loss": 0.2878, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.806640625, + "rewards/margins": 10.828125, + "rewards/rejected": -8.0390625, + "step": 2499 + }, + { + "epoch": 0.4960563520015874, + "grad_norm": 28.31211573471761, + "learning_rate": 6.345787394800503e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.01171875, + "logps/chosen": -1208.0, + "logps/rejected": -759.0, + "loss": 0.3841, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.646484375, + "rewards/margins": 7.6953125, + "rewards/rejected": -5.0390625, + "step": 2500 + }, + { + "epoch": 0.49625477454238803, + "grad_norm": 32.1673841342832, + "learning_rate": 6.342726077695886e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 3.9453125, + "logps/chosen": -825.0, + "logps/rejected": -653.0, + "loss": 0.447, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.76171875, + "rewards/margins": 6.0390625, + "rewards/rejected": -4.27734375, + "step": 2501 + }, + { + "epoch": 0.49645319708318864, + "grad_norm": 33.085046908807655, + "learning_rate": 6.339664356350539e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 3.859375, + "logps/chosen": -1265.0, + "logps/rejected": -813.0, + "loss": 0.4056, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.927734375, + "rewards/margins": 6.69921875, + "rewards/rejected": -4.76953125, + "step": 2502 + }, + { + "epoch": 0.4966516196239893, + "grad_norm": 30.84097743438943, + "learning_rate": 6.336602232233115e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.125, + "logps/chosen": -850.5, + "logps/rejected": -848.5, + "loss": 0.4984, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.46875, + "rewards/margins": 6.4296875, + 
"rewards/rejected": -3.9609375, + "step": 2503 + }, + { + "epoch": 0.4968500421647899, + "grad_norm": 31.55102987318661, + "learning_rate": 6.333539706812463e-07, + "logits/chosen": 4.328125, + "logits/rejected": 3.96484375, + "logps/chosen": -989.0, + "logps/rejected": -1687.0, + "loss": 0.5488, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.523193359375, + "rewards/margins": 7.609375, + "rewards/rejected": -6.078125, + "step": 2504 + }, + { + "epoch": 0.49704846470559055, + "grad_norm": 37.68729162032727, + "learning_rate": 6.330476781557617e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.25, + "logps/chosen": -954.0, + "logps/rejected": -786.0, + "loss": 0.4096, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.478515625, + "rewards/margins": 16.859375, + "rewards/rejected": -15.37109375, + "step": 2505 + }, + { + "epoch": 0.4972468872463912, + "grad_norm": 25.05012765424016, + "learning_rate": 6.327413457937811e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.96875, + "logps/chosen": -872.0, + "logps/rejected": -726.0, + "loss": 0.4903, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.4658203125, + "rewards/margins": 6.16064453125, + "rewards/rejected": -3.6787109375, + "step": 2506 + }, + { + "epoch": 0.4974453097871918, + "grad_norm": 29.47503705820922, + "learning_rate": 6.324349737422469e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.15625, + "logps/chosen": -788.0, + "logps/rejected": -579.0, + "loss": 0.5026, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.388671875, + "rewards/margins": 4.79296875, + "rewards/rejected": -2.39453125, + "step": 2507 + }, + { + "epoch": 0.49764373232799247, + "grad_norm": 39.756296468424594, + "learning_rate": 6.3212856214812e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.10546875, + "logps/chosen": -815.5, + "logps/rejected": -512.0, + "loss": 0.4636, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.587890625, + "rewards/margins": 5.4921875, + 
"rewards/rejected": -2.89453125, + "step": 2508 + }, + { + "epoch": 0.4978421548687931, + "grad_norm": 35.01825793509393, + "learning_rate": 6.318221111583809e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.16796875, + "logps/chosen": -1026.0, + "logps/rejected": -1194.0, + "loss": 0.5012, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.208984375, + "rewards/margins": 5.6484375, + "rewards/rejected": -3.44140625, + "step": 2509 + }, + { + "epoch": 0.49804057740959373, + "grad_norm": 32.75693630188166, + "learning_rate": 6.315156209200282e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.8203125, + "logps/chosen": -931.5, + "logps/rejected": -800.5, + "loss": 0.4553, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.81640625, + "rewards/margins": 5.86328125, + "rewards/rejected": -4.04296875, + "step": 2510 + }, + { + "epoch": 0.4982389999503944, + "grad_norm": 29.53251375745422, + "learning_rate": 6.312090915800803e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.3046875, + "logps/chosen": -739.0, + "logps/rejected": -675.0, + "loss": 0.4364, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.03125, + "rewards/margins": 6.7734375, + "rewards/rejected": -4.75390625, + "step": 2511 + }, + { + "epoch": 0.498437422491195, + "grad_norm": 40.386572056596975, + "learning_rate": 6.309025232855737e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.234375, + "logps/chosen": -819.0, + "logps/rejected": -726.0, + "loss": 0.4433, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.955078125, + "rewards/margins": 6.12890625, + "rewards/rejected": -4.169921875, + "step": 2512 + }, + { + "epoch": 0.49863584503199565, + "grad_norm": 26.9446418462019, + "learning_rate": 6.305959161835637e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.8828125, + "logps/chosen": -1216.0, + "logps/rejected": -1140.0, + "loss": 0.312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3046875, + "rewards/margins": 9.4765625, + "rewards/rejected": 
-6.17578125, + "step": 2513 + }, + { + "epoch": 0.49883426757279625, + "grad_norm": 36.19003781011854, + "learning_rate": 6.302892704211244e-07, + "logits/chosen": 3.69921875, + "logits/rejected": 3.625, + "logps/chosen": -1222.0, + "logps/rejected": -951.0, + "loss": 0.4168, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.630859375, + "rewards/margins": 8.2421875, + "rewards/rejected": -5.59375, + "step": 2514 + }, + { + "epoch": 0.4990326901135969, + "grad_norm": 26.234576712274965, + "learning_rate": 6.299825861453483e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.71484375, + "logps/chosen": -874.0, + "logps/rejected": -616.5, + "loss": 0.4563, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.982421875, + "rewards/margins": 6.8359375, + "rewards/rejected": -4.84375, + "step": 2515 + }, + { + "epoch": 0.4992311126543975, + "grad_norm": 38.11825445572487, + "learning_rate": 6.296758635033464e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 3.9296875, + "logps/chosen": -842.0, + "logps/rejected": -714.5, + "loss": 0.5848, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.21728515625, + "rewards/margins": 5.259765625, + "rewards/rejected": -4.048828125, + "step": 2516 + }, + { + "epoch": 0.49942953519519817, + "grad_norm": 37.6909862310381, + "learning_rate": 6.293691026422481e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 3.69140625, + "logps/chosen": -747.0, + "logps/rejected": -643.5, + "loss": 0.552, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.5546875, + "rewards/margins": 5.5234375, + "rewards/rejected": -3.96875, + "step": 2517 + }, + { + "epoch": 0.49962795773599883, + "grad_norm": 36.26616013510756, + "learning_rate": 6.29062303709201e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.4921875, + "logps/chosen": -1426.0, + "logps/rejected": -908.5, + "loss": 0.4336, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.693359375, + "rewards/margins": 17.5546875, + "rewards/rejected": -14.875, + "step": 
2518 + }, + { + "epoch": 0.49982638027679943, + "grad_norm": 31.07984881079563, + "learning_rate": 6.287554668513713e-07, + "logits/chosen": 3.7109375, + "logits/rejected": 3.7265625, + "logps/chosen": -795.5, + "logps/rejected": -733.5, + "loss": 0.3755, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3037109375, + "rewards/margins": 7.7890625, + "rewards/rejected": -5.4765625, + "step": 2519 + }, + { + "epoch": 0.5000248028176001, + "grad_norm": 26.866730120568132, + "learning_rate": 6.284485922159431e-07, + "logits/chosen": 3.80859375, + "logits/rejected": 3.77734375, + "logps/chosen": -1022.0, + "logps/rejected": -713.0, + "loss": 0.3562, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9801025390625, + "rewards/margins": 8.41015625, + "rewards/rejected": -6.4375, + "step": 2520 + }, + { + "epoch": 0.5002232253584007, + "grad_norm": 26.811935930116128, + "learning_rate": 6.281416799501187e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 3.97265625, + "logps/chosen": -980.0, + "logps/rejected": -725.0, + "loss": 0.3895, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.46875, + "rewards/margins": 7.3359375, + "rewards/rejected": -4.8671875, + "step": 2521 + }, + { + "epoch": 0.5004216478992013, + "grad_norm": 26.24887158449904, + "learning_rate": 6.278347302011184e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.734375, + "logps/chosen": -1329.0, + "logps/rejected": -801.0, + "loss": 0.255, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.078125, + "rewards/margins": 9.625, + "rewards/rejected": -6.5234375, + "step": 2522 + }, + { + "epoch": 0.500620070440002, + "grad_norm": 35.58563666741696, + "learning_rate": 6.275277431161806e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.171875, + "logps/chosen": -854.0, + "logps/rejected": -641.0, + "loss": 0.4095, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.306640625, + "rewards/margins": 7.15625, + "rewards/rejected": -4.83203125, + "step": 2523 + }, + { + "epoch": 
0.5008184929808026, + "grad_norm": 27.282997343262352, + "learning_rate": 6.272207188425618e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.15625, + "logps/chosen": -1160.0, + "logps/rejected": -683.5, + "loss": 0.4346, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.814453125, + "rewards/margins": 7.23828125, + "rewards/rejected": -5.421875, + "step": 2524 + }, + { + "epoch": 0.5010169155216032, + "grad_norm": 46.53051146367882, + "learning_rate": 6.269136575275358e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.78125, + "logps/chosen": -1037.0, + "logps/rejected": -1239.0, + "loss": 0.4363, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.908203125, + "rewards/margins": 6.953125, + "rewards/rejected": -5.05859375, + "step": 2525 + }, + { + "epoch": 0.5012153380624039, + "grad_norm": 27.627071085864543, + "learning_rate": 6.266065593183944e-07, + "logits/chosen": 4.7109375, + "logits/rejected": 4.5859375, + "logps/chosen": -1207.5, + "logps/rejected": -821.0, + "loss": 0.3936, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.48828125, + "rewards/margins": 16.1796875, + "rewards/rejected": -13.68359375, + "step": 2526 + }, + { + "epoch": 0.5014137606032045, + "grad_norm": 29.36602914361844, + "learning_rate": 6.262994243624474e-07, + "logits/chosen": 4.3125, + "logits/rejected": 3.9609375, + "logps/chosen": -938.0, + "logps/rejected": -1541.0, + "loss": 0.3116, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.748046875, + "rewards/margins": 10.0859375, + "rewards/rejected": -7.33203125, + "step": 2527 + }, + { + "epoch": 0.5016121831440051, + "grad_norm": 36.916234689189764, + "learning_rate": 6.259922528070218e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.19140625, + "logps/chosen": -882.0, + "logps/rejected": -567.5, + "loss": 0.4636, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.185546875, + "rewards/margins": 6.2578125, + "rewards/rejected": -4.0625, + "step": 2528 + }, + { + "epoch": 
0.5018106056848058, + "grad_norm": 33.51079692957631, + "learning_rate": 6.256850447994624e-07, + "logits/chosen": 3.77734375, + "logits/rejected": 3.96484375, + "logps/chosen": -981.0, + "logps/rejected": -737.0, + "loss": 0.3978, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.373046875, + "rewards/margins": 6.80859375, + "rewards/rejected": -4.4384765625, + "step": 2529 + }, + { + "epoch": 0.5020090282256064, + "grad_norm": 42.657300580517656, + "learning_rate": 6.253778004871314e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 3.8515625, + "logps/chosen": -709.0, + "logps/rejected": -1310.0, + "loss": 0.4799, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.716796875, + "rewards/margins": 10.5, + "rewards/rejected": -8.7734375, + "step": 2530 + }, + { + "epoch": 0.502207450766407, + "grad_norm": 42.07367740415488, + "learning_rate": 6.250705200174084e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.1875, + "logps/chosen": -1013.0, + "logps/rejected": -753.0, + "loss": 0.312, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0, + "rewards/margins": 8.0625, + "rewards/rejected": -5.05078125, + "step": 2531 + }, + { + "epoch": 0.5024058733072077, + "grad_norm": 42.71634521515174, + "learning_rate": 6.247632035376905e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.2734375, + "logps/chosen": -1001.0, + "logps/rejected": -794.0, + "loss": 0.3375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5390625, + "rewards/margins": 7.0859375, + "rewards/rejected": -4.54296875, + "step": 2532 + }, + { + "epoch": 0.5026042958480084, + "grad_norm": 34.54233549873875, + "learning_rate": 6.24455851195392e-07, + "logits/chosen": 4.5390625, + "logits/rejected": 4.7265625, + "logps/chosen": -979.0, + "logps/rejected": -809.0, + "loss": 0.4005, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.9375, + "rewards/margins": 7.828125, + "rewards/rejected": -5.90234375, + "step": 2533 + }, + { + "epoch": 0.502802718388809, + "grad_norm": 
31.758657167806554, + "learning_rate": 6.241484631379443e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.640625, + "logps/chosen": -1069.0, + "logps/rejected": -753.0, + "loss": 0.4505, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.509765625, + "rewards/margins": 5.90625, + "rewards/rejected": -4.38671875, + "step": 2534 + }, + { + "epoch": 0.5030011409296096, + "grad_norm": 40.52861676793424, + "learning_rate": 6.238410395127958e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.0859375, + "logps/chosen": -1098.0, + "logps/rejected": -750.0, + "loss": 0.4481, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.048828125, + "rewards/margins": 5.94921875, + "rewards/rejected": -3.90234375, + "step": 2535 + }, + { + "epoch": 0.5031995634704103, + "grad_norm": 34.430538229028066, + "learning_rate": 6.235335804674124e-07, + "logits/chosen": 4.484375, + "logits/rejected": 3.98828125, + "logps/chosen": -907.0, + "logps/rejected": -645.0, + "loss": 0.4274, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.08203125, + "rewards/margins": 6.34375, + "rewards/rejected": -4.259765625, + "step": 2536 + }, + { + "epoch": 0.5033979860112109, + "grad_norm": 31.924143159638874, + "learning_rate": 6.232260861492766e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 3.9296875, + "logps/chosen": -1014.5, + "logps/rejected": -696.0, + "loss": 0.5471, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.76611328125, + "rewards/margins": 5.287109375, + "rewards/rejected": -3.51953125, + "step": 2537 + }, + { + "epoch": 0.5035964085520115, + "grad_norm": 27.464369324654502, + "learning_rate": 6.229185567058879e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.0859375, + "logps/chosen": -878.0, + "logps/rejected": -698.0, + "loss": 0.3807, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.353515625, + "rewards/margins": 17.4453125, + "rewards/rejected": -15.0703125, + "step": 2538 + }, + { + "epoch": 0.5037948310928121, + "grad_norm": 
32.05321423687428, + "learning_rate": 6.22610992284763e-07, + "logits/chosen": 3.80859375, + "logits/rejected": 3.82421875, + "logps/chosen": -1192.0, + "logps/rejected": -662.5, + "loss": 0.3409, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.65234375, + "rewards/margins": 8.796875, + "rewards/rejected": -6.1484375, + "step": 2539 + }, + { + "epoch": 0.5039932536336128, + "grad_norm": 30.006035742832253, + "learning_rate": 6.223033930334348e-07, + "logits/chosen": 4.0625, + "logits/rejected": 3.86328125, + "logps/chosen": -826.0, + "logps/rejected": -631.5, + "loss": 0.3732, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.05859375, + "rewards/margins": 6.3671875, + "rewards/rejected": -4.3046875, + "step": 2540 + }, + { + "epoch": 0.5041916761744134, + "grad_norm": 26.792441496330948, + "learning_rate": 6.219957590994533e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.359375, + "logps/chosen": -827.0, + "logps/rejected": -602.5, + "loss": 0.4328, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.017578125, + "rewards/margins": 6.8046875, + "rewards/rejected": -4.78125, + "step": 2541 + }, + { + "epoch": 0.504390098715214, + "grad_norm": 38.69700310313647, + "learning_rate": 6.216880906303852e-07, + "logits/chosen": 3.58203125, + "logits/rejected": 3.65625, + "logps/chosen": -896.0, + "logps/rejected": -697.0, + "loss": 0.4782, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.787109375, + "rewards/margins": 7.21875, + "rewards/rejected": -5.4296875, + "step": 2542 + }, + { + "epoch": 0.5045885212560147, + "grad_norm": 30.932026925392144, + "learning_rate": 6.213803877738133e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.86328125, + "logps/chosen": -993.0, + "logps/rejected": -1134.0, + "loss": 0.4015, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.919921875, + "rewards/margins": 8.02734375, + "rewards/rejected": -6.09765625, + "step": 2543 + }, + { + "epoch": 0.5047869437968153, + "grad_norm": 38.960495792341234, + 
"learning_rate": 6.210726506773374e-07, + "logits/chosen": 4.0625, + "logits/rejected": 3.91796875, + "logps/chosen": -1063.0, + "logps/rejected": -707.0, + "loss": 0.4443, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.953125, + "rewards/margins": 7.578125, + "rewards/rejected": -5.62890625, + "step": 2544 + }, + { + "epoch": 0.5049853663376159, + "grad_norm": 36.91091198753716, + "learning_rate": 6.207648794885734e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.08203125, + "logps/chosen": -816.0, + "logps/rejected": -752.5, + "loss": 0.5617, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.779296875, + "rewards/margins": 5.890625, + "rewards/rejected": -4.103515625, + "step": 2545 + }, + { + "epoch": 0.5051837888784166, + "grad_norm": 50.589528792102, + "learning_rate": 6.204570743551537e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 3.625, + "logps/chosen": -1129.0, + "logps/rejected": -754.0, + "loss": 0.3402, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.41015625, + "rewards/margins": 7.8828125, + "rewards/rejected": -4.47265625, + "step": 2546 + }, + { + "epoch": 0.5053822114192172, + "grad_norm": 34.91043203270989, + "learning_rate": 6.20149235424727e-07, + "logits/chosen": 4.21875, + "logits/rejected": 3.87890625, + "logps/chosen": -1138.0, + "logps/rejected": -754.5, + "loss": 0.3708, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.900390625, + "rewards/margins": 8.5078125, + "rewards/rejected": -6.625, + "step": 2547 + }, + { + "epoch": 0.5055806339600178, + "grad_norm": 35.336562051069365, + "learning_rate": 6.198413628449581e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.0078125, + "logps/chosen": -962.0, + "logps/rejected": -946.0, + "loss": 0.4217, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.330078125, + "rewards/margins": 8.0859375, + "rewards/rejected": -6.75390625, + "step": 2548 + }, + { + "epoch": 0.5057790565008184, + "grad_norm": 33.509609263245835, + "learning_rate": 
6.195334567635283e-07, + "logits/chosen": 3.859375, + "logits/rejected": 3.95703125, + "logps/chosen": -1272.0, + "logps/rejected": -836.0, + "loss": 0.2973, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.849609375, + "rewards/margins": 9.875, + "rewards/rejected": -7.0234375, + "step": 2549 + }, + { + "epoch": 0.5059774790416192, + "grad_norm": 34.0967477591387, + "learning_rate": 6.192255173281342e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.171875, + "logps/chosen": -1135.0, + "logps/rejected": -897.0, + "loss": 0.3773, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1298828125, + "rewards/margins": 8.3203125, + "rewards/rejected": -6.1953125, + "step": 2550 + }, + { + "epoch": 0.5061759015824198, + "grad_norm": 36.73483860895207, + "learning_rate": 6.189175446864892e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.80078125, + "logps/chosen": -1228.0, + "logps/rejected": -756.0, + "loss": 0.3006, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.671875, + "rewards/margins": 9.03125, + "rewards/rejected": -6.359375, + "step": 2551 + }, + { + "epoch": 0.5063743241232204, + "grad_norm": 27.794910036297768, + "learning_rate": 6.186095389863222e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.5546875, + "logps/chosen": -1256.0, + "logps/rejected": -839.5, + "loss": 0.3669, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.48828125, + "rewards/margins": 9.7109375, + "rewards/rejected": -7.2265625, + "step": 2552 + }, + { + "epoch": 0.5065727466640211, + "grad_norm": 31.959171229899468, + "learning_rate": 6.18301500375378e-07, + "logits/chosen": 4.78125, + "logits/rejected": 4.8046875, + "logps/chosen": -812.0, + "logps/rejected": -1206.0, + "loss": 0.4563, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.0, + "rewards/margins": 6.8359375, + "rewards/rejected": -4.8203125, + "step": 2553 + }, + { + "epoch": 0.5067711692048217, + "grad_norm": 31.755582847782144, + "learning_rate": 6.179934290014173e-07, + 
"logits/chosen": 3.73046875, + "logits/rejected": 3.8515625, + "logps/chosen": -965.0, + "logps/rejected": -786.0, + "loss": 0.3435, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.24609375, + "rewards/margins": 7.6875, + "rewards/rejected": -5.45703125, + "step": 2554 + }, + { + "epoch": 0.5069695917456223, + "grad_norm": 31.240966694530524, + "learning_rate": 6.176853250122167e-07, + "logits/chosen": 4.078125, + "logits/rejected": 4.140625, + "logps/chosen": -986.0, + "logps/rejected": -1390.5, + "loss": 0.3318, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.640625, + "rewards/margins": 12.6484375, + "rewards/rejected": -9.984375, + "step": 2555 + }, + { + "epoch": 0.5071680142864229, + "grad_norm": 25.95044247485374, + "learning_rate": 6.173771885555679e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 4.046875, + "logps/chosen": -837.5, + "logps/rejected": -651.5, + "loss": 0.5247, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5556640625, + "rewards/margins": 5.3046875, + "rewards/rejected": -3.75, + "step": 2556 + }, + { + "epoch": 0.5073664368272236, + "grad_norm": 37.16465620390846, + "learning_rate": 6.170690197792784e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 4.3828125, + "logps/chosen": -842.0, + "logps/rejected": -909.0, + "loss": 0.3997, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.861328125, + "rewards/margins": 7.40625, + "rewards/rejected": -4.54296875, + "step": 2557 + }, + { + "epoch": 0.5075648593680242, + "grad_norm": 35.40603018075707, + "learning_rate": 6.167608188311718e-07, + "logits/chosen": 4.390625, + "logits/rejected": 4.4921875, + "logps/chosen": -1390.0, + "logps/rejected": -1261.0, + "loss": 0.3676, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.099609375, + "rewards/margins": 8.375, + "rewards/rejected": -5.26953125, + "step": 2558 + }, + { + "epoch": 0.5077632819088248, + "grad_norm": 37.19957580943896, + "learning_rate": 6.164525858590861e-07, + "logits/chosen": 4.03515625, 
+ "logits/rejected": 4.015625, + "logps/chosen": -1074.0, + "logps/rejected": -697.5, + "loss": 0.4665, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7421875, + "rewards/margins": 7.46484375, + "rewards/rejected": -4.7265625, + "step": 2559 + }, + { + "epoch": 0.5079617044496255, + "grad_norm": 35.19573555388864, + "learning_rate": 6.161443210108755e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.9609375, + "logps/chosen": -1369.0, + "logps/rejected": -839.5, + "loss": 0.4146, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.95703125, + "rewards/margins": 8.0078125, + "rewards/rejected": -4.06640625, + "step": 2560 + }, + { + "epoch": 0.5081601269904261, + "grad_norm": 31.794460484946264, + "learning_rate": 6.158360244344089e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.9765625, + "logps/chosen": -777.5, + "logps/rejected": -547.0, + "loss": 0.5689, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.7236328125, + "rewards/margins": 5.62109375, + "rewards/rejected": -3.888671875, + "step": 2561 + }, + { + "epoch": 0.5083585495312267, + "grad_norm": 34.06615143381865, + "learning_rate": 6.155276962775709e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.15234375, + "logps/chosen": -831.0, + "logps/rejected": -554.5, + "loss": 0.6312, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.757080078125, + "rewards/margins": 5.61572265625, + "rewards/rejected": -4.84765625, + "step": 2562 + }, + { + "epoch": 0.5085569720720274, + "grad_norm": 34.429408713653956, + "learning_rate": 6.15219336688261e-07, + "logits/chosen": 4.046875, + "logits/rejected": 4.08203125, + "logps/chosen": -1037.0, + "logps/rejected": -1744.0, + "loss": 0.3946, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3720703125, + "rewards/margins": 9.21875, + "rewards/rejected": -6.8359375, + "step": 2563 + }, + { + "epoch": 0.508755394612828, + "grad_norm": 37.037184213797545, + "learning_rate": 6.149109458143938e-07, + "logits/chosen": 3.921875, + 
"logits/rejected": 3.8828125, + "logps/chosen": -1075.0, + "logps/rejected": -789.5, + "loss": 0.4229, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.75390625, + "rewards/margins": 7.390625, + "rewards/rejected": -4.63671875, + "step": 2564 + }, + { + "epoch": 0.5089538171536286, + "grad_norm": 35.663716975153854, + "learning_rate": 6.14602523803899e-07, + "logits/chosen": 3.50390625, + "logits/rejected": 3.38671875, + "logps/chosen": -1292.0, + "logps/rejected": -1032.0, + "loss": 0.3121, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.03125, + "rewards/margins": 8.3125, + "rewards/rejected": -5.283203125, + "step": 2565 + }, + { + "epoch": 0.5091522396944292, + "grad_norm": 37.05042026578558, + "learning_rate": 6.14294070804721e-07, + "logits/chosen": 3.58984375, + "logits/rejected": 3.421875, + "logps/chosen": -830.0, + "logps/rejected": -550.0, + "loss": 0.426, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.765625, + "rewards/margins": 6.3671875, + "rewards/rejected": -3.6015625, + "step": 2566 + }, + { + "epoch": 0.50935066223523, + "grad_norm": 27.732076997006548, + "learning_rate": 6.139855869648193e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 4.3359375, + "logps/chosen": -900.0, + "logps/rejected": -542.0, + "loss": 0.4694, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.81640625, + "rewards/margins": 7.6640625, + "rewards/rejected": -4.8515625, + "step": 2567 + }, + { + "epoch": 0.5095490847760306, + "grad_norm": 32.927157472069744, + "learning_rate": 6.136770724321682e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 4.00390625, + "logps/chosen": -1068.0, + "logps/rejected": -782.0, + "loss": 0.462, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.0009765625, + "rewards/margins": 6.578125, + "rewards/rejected": -4.5859375, + "step": 2568 + }, + { + "epoch": 0.5097475073168312, + "grad_norm": 32.62063519811396, + "learning_rate": 6.133685273547566e-07, + "logits/chosen": 3.75, + "logits/rejected": 
3.7578125, + "logps/chosen": -1109.0, + "logps/rejected": -1118.0, + "loss": 0.4769, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.21484375, + "rewards/margins": 11.0703125, + "rewards/rejected": -8.857421875, + "step": 2569 + }, + { + "epoch": 0.5099459298576319, + "grad_norm": 31.63008566044727, + "learning_rate": 6.130599518805878e-07, + "logits/chosen": 3.8515625, + "logits/rejected": 3.953125, + "logps/chosen": -975.5, + "logps/rejected": -769.0, + "loss": 0.4866, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.537109375, + "rewards/margins": 7.4765625, + "rewards/rejected": -4.9453125, + "step": 2570 + }, + { + "epoch": 0.5101443523984325, + "grad_norm": 34.372609085486154, + "learning_rate": 6.127513461576805e-07, + "logits/chosen": 3.74609375, + "logits/rejected": 3.890625, + "logps/chosen": -918.0, + "logps/rejected": -737.5, + "loss": 0.5234, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.78125, + "rewards/margins": 6.140625, + "rewards/rejected": -4.37109375, + "step": 2571 + }, + { + "epoch": 0.5103427749392331, + "grad_norm": 33.18108174529522, + "learning_rate": 6.124427103340669e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.75, + "logps/chosen": -752.0, + "logps/rejected": -1619.0, + "loss": 0.3527, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.244140625, + "rewards/margins": 12.7265625, + "rewards/rejected": -10.46875, + "step": 2572 + }, + { + "epoch": 0.5105411974800337, + "grad_norm": 34.03886762571987, + "learning_rate": 6.121340445577945e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.65625, + "logps/chosen": -1270.0, + "logps/rejected": -830.5, + "loss": 0.4047, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.060546875, + "rewards/margins": 8.28125, + "rewards/rejected": -6.2265625, + "step": 2573 + }, + { + "epoch": 0.5107396200208344, + "grad_norm": 39.30611210172156, + "learning_rate": 6.118253489769247e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.26953125, + 
"logps/chosen": -947.0, + "logps/rejected": -766.5, + "loss": 0.5192, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4296875, + "rewards/margins": 6.3671875, + "rewards/rejected": -3.94140625, + "step": 2574 + }, + { + "epoch": 0.510938042561635, + "grad_norm": 42.28537835728778, + "learning_rate": 6.115166237395331e-07, + "logits/chosen": 3.85546875, + "logits/rejected": 3.95703125, + "logps/chosen": -1255.0, + "logps/rejected": -972.0, + "loss": 0.4221, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.537109375, + "rewards/margins": 8.359375, + "rewards/rejected": -5.8203125, + "step": 2575 + }, + { + "epoch": 0.5111364651024356, + "grad_norm": 23.70962136189678, + "learning_rate": 6.112078689937098e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 4.1015625, + "logps/chosen": -918.0, + "logps/rejected": -969.0, + "loss": 0.3559, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.31640625, + "rewards/margins": 10.0625, + "rewards/rejected": -6.7578125, + "step": 2576 + }, + { + "epoch": 0.5113348876432363, + "grad_norm": 38.79778773550236, + "learning_rate": 6.10899084887559e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.44140625, + "logps/chosen": -903.0, + "logps/rejected": -611.5, + "loss": 0.4425, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.4716796875, + "rewards/margins": 6.8203125, + "rewards/rejected": -5.3515625, + "step": 2577 + }, + { + "epoch": 0.5115333101840369, + "grad_norm": 32.32815593754552, + "learning_rate": 6.105902715691989e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 3.953125, + "logps/chosen": -1039.0, + "logps/rejected": -800.5, + "loss": 0.407, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.900390625, + "rewards/margins": 6.6328125, + "rewards/rejected": -3.734375, + "step": 2578 + }, + { + "epoch": 0.5117317327248375, + "grad_norm": 32.74811822062414, + "learning_rate": 6.10281429186762e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.234375, + "logps/chosen": 
-784.0, + "logps/rejected": -804.0, + "loss": 0.4742, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.55029296875, + "rewards/margins": 6.296875, + "rewards/rejected": -4.7578125, + "step": 2579 + }, + { + "epoch": 0.5119301552656382, + "grad_norm": 32.533809821986694, + "learning_rate": 6.099725578883942e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.15625, + "logps/chosen": -841.0, + "logps/rejected": -884.0, + "loss": 0.3966, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6796875, + "rewards/margins": 7.2734375, + "rewards/rejected": -5.5859375, + "step": 2580 + }, + { + "epoch": 0.5121285778064388, + "grad_norm": 30.49133493470565, + "learning_rate": 6.096636578222557e-07, + "logits/chosen": 3.84375, + "logits/rejected": 3.3515625, + "logps/chosen": -1246.0, + "logps/rejected": -579.0, + "loss": 0.4675, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3828125, + "rewards/margins": 5.428466796875, + "rewards/rejected": -5.0546875, + "step": 2581 + }, + { + "epoch": 0.5123270003472394, + "grad_norm": 32.996637231157536, + "learning_rate": 6.093547291365203e-07, + "logits/chosen": 3.69921875, + "logits/rejected": 3.46484375, + "logps/chosen": -783.0, + "logps/rejected": -1087.0, + "loss": 0.2602, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.58984375, + "rewards/margins": 9.21875, + "rewards/rejected": -6.6171875, + "step": 2582 + }, + { + "epoch": 0.51252542288804, + "grad_norm": 34.30112138204938, + "learning_rate": 6.090457719793756e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.30078125, + "logps/chosen": -1119.0, + "logps/rejected": -676.5, + "loss": 0.385, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.27734375, + "rewards/margins": 8.921875, + "rewards/rejected": -5.6484375, + "step": 2583 + }, + { + "epoch": 0.5127238454288408, + "grad_norm": 28.950024998836074, + "learning_rate": 6.087367864990232e-07, + "logits/chosen": 4.078125, + "logits/rejected": 3.87890625, + "logps/chosen": -946.0, + 
"logps/rejected": -1214.5, + "loss": 0.3303, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.26953125, + "rewards/margins": 9.3515625, + "rewards/rejected": -7.08203125, + "step": 2584 + }, + { + "epoch": 0.5129222679696414, + "grad_norm": 31.316755127886925, + "learning_rate": 6.084277728436777e-07, + "logits/chosen": 3.953125, + "logits/rejected": 4.1328125, + "logps/chosen": -481.0, + "logps/rejected": -479.0, + "loss": 0.5948, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.876953125, + "rewards/margins": 4.578125, + "rewards/rejected": -3.703125, + "step": 2585 + }, + { + "epoch": 0.513120690510442, + "grad_norm": 36.12944223839041, + "learning_rate": 6.081187311615671e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.28515625, + "logps/chosen": -842.0, + "logps/rejected": -689.0, + "loss": 0.4697, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.92578125, + "rewards/margins": 6.47265625, + "rewards/rejected": -3.55078125, + "step": 2586 + }, + { + "epoch": 0.5133191130512427, + "grad_norm": 23.019473375756803, + "learning_rate": 6.078096616009336e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.19140625, + "logps/chosen": -779.0, + "logps/rejected": -479.0, + "loss": 0.4218, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.41015625, + "rewards/margins": 6.35546875, + "rewards/rejected": -3.943359375, + "step": 2587 + }, + { + "epoch": 0.5135175355920433, + "grad_norm": 33.354480819092295, + "learning_rate": 6.075005643100325e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.1796875, + "logps/chosen": -883.5, + "logps/rejected": -754.5, + "loss": 0.583, + "rewards/accuracies": 0.65625, + "rewards/chosen": 1.76171875, + "rewards/margins": 6.267578125, + "rewards/rejected": -4.5166015625, + "step": 2588 + }, + { + "epoch": 0.5137159581328439, + "grad_norm": 36.1209346292482, + "learning_rate": 6.071914394371321e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 4.09765625, + "logps/chosen": -933.0, + 
"logps/rejected": -689.0, + "loss": 0.3408, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.431640625, + "rewards/margins": 8.078125, + "rewards/rejected": -5.640625, + "step": 2589 + }, + { + "epoch": 0.5139143806736445, + "grad_norm": 33.150859367036986, + "learning_rate": 6.06882287130514e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 3.890625, + "logps/chosen": -1033.0, + "logps/rejected": -872.0, + "loss": 0.4092, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.4140625, + "rewards/margins": 7.3203125, + "rewards/rejected": -4.90625, + "step": 2590 + }, + { + "epoch": 0.5141128032144452, + "grad_norm": 22.730859469144445, + "learning_rate": 6.065731075384732e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.11328125, + "logps/chosen": -646.0, + "logps/rejected": -510.5, + "loss": 0.4877, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0478515625, + "rewards/margins": 7.84375, + "rewards/rejected": -6.78515625, + "step": 2591 + }, + { + "epoch": 0.5143112257552458, + "grad_norm": 33.15716160703052, + "learning_rate": 6.062639008093177e-07, + "logits/chosen": 3.65625, + "logits/rejected": 3.609375, + "logps/chosen": -1072.0, + "logps/rejected": -931.0, + "loss": 0.4721, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4892578125, + "rewards/margins": 8.125, + "rewards/rejected": -5.6328125, + "step": 2592 + }, + { + "epoch": 0.5145096482960464, + "grad_norm": 30.103772239597888, + "learning_rate": 6.059546670913684e-07, + "logits/chosen": 4.37890625, + "logits/rejected": 4.375, + "logps/chosen": -875.0, + "logps/rejected": -527.0, + "loss": 0.5172, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.83544921875, + "rewards/margins": 5.5859375, + "rewards/rejected": -3.75, + "step": 2593 + }, + { + "epoch": 0.5147080708368471, + "grad_norm": 35.38167333428913, + "learning_rate": 6.056454065329591e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.2890625, + "logps/chosen": -1321.0, + "logps/rejected": -1011.0, 
+ "loss": 0.4136, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.984375, + "rewards/margins": 7.4921875, + "rewards/rejected": -4.50390625, + "step": 2594 + }, + { + "epoch": 0.5149064933776477, + "grad_norm": 33.81261139343441, + "learning_rate": 6.053361192824368e-07, + "logits/chosen": 3.65234375, + "logits/rejected": 3.5, + "logps/chosen": -1072.0, + "logps/rejected": -1369.0, + "loss": 0.4519, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.12066650390625, + "rewards/margins": 8.92578125, + "rewards/rejected": -6.796875, + "step": 2595 + }, + { + "epoch": 0.5151049159184483, + "grad_norm": 28.849729199018153, + "learning_rate": 6.050268054881611e-07, + "logits/chosen": 4.359375, + "logits/rejected": 4.515625, + "logps/chosen": -1030.0, + "logps/rejected": -1509.0, + "loss": 0.4293, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.099609375, + "rewards/margins": 8.91015625, + "rewards/rejected": -6.8046875, + "step": 2596 + }, + { + "epoch": 0.5153033384592489, + "grad_norm": 30.493191670129093, + "learning_rate": 6.047174652985041e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.8515625, + "logps/chosen": -1116.0, + "logps/rejected": -755.0, + "loss": 0.3124, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.30078125, + "rewards/margins": 7.78125, + "rewards/rejected": -5.48828125, + "step": 2597 + }, + { + "epoch": 0.5155017610000496, + "grad_norm": 29.748209994794717, + "learning_rate": 6.044080988618512e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 3.68359375, + "logps/chosen": -1362.0, + "logps/rejected": -747.5, + "loss": 0.1838, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4140625, + "rewards/margins": 9.359375, + "rewards/rejected": -5.9296875, + "step": 2598 + }, + { + "epoch": 0.5157001835408502, + "grad_norm": 30.319601254821336, + "learning_rate": 6.040987063265996e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 4.0234375, + "logps/chosen": -969.5, + "logps/rejected": -752.5, + "loss": 
0.4655, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8173828125, + "rewards/margins": 6.52734375, + "rewards/rejected": -4.71484375, + "step": 2599 + }, + { + "epoch": 0.5158986060816508, + "grad_norm": 32.022854114201, + "learning_rate": 6.037892878411597e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.31640625, + "logps/chosen": -1067.0, + "logps/rejected": -718.5, + "loss": 0.4182, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.189453125, + "rewards/margins": 7.625, + "rewards/rejected": -5.44140625, + "step": 2600 + }, + { + "epoch": 0.5160970286224515, + "grad_norm": 29.388863130181207, + "learning_rate": 6.034798435539538e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 3.9140625, + "logps/chosen": -936.5, + "logps/rejected": -552.0, + "loss": 0.5048, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.32421875, + "rewards/margins": 5.14453125, + "rewards/rejected": -2.81298828125, + "step": 2601 + }, + { + "epoch": 0.5162954511632522, + "grad_norm": 27.763198462628406, + "learning_rate": 6.031703736134168e-07, + "logits/chosen": 3.734375, + "logits/rejected": 3.71875, + "logps/chosen": -879.0, + "logps/rejected": -585.0, + "loss": 0.4035, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.048828125, + "rewards/margins": 7.0703125, + "rewards/rejected": -5.02734375, + "step": 2602 + }, + { + "epoch": 0.5164938737040528, + "grad_norm": 37.14153276306482, + "learning_rate": 6.02860878167996e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 3.94921875, + "logps/chosen": -948.0, + "logps/rejected": -695.0, + "loss": 0.4085, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.89453125, + "rewards/margins": 8.8984375, + "rewards/rejected": -7.0234375, + "step": 2603 + }, + { + "epoch": 0.5166922962448535, + "grad_norm": 45.50608443550039, + "learning_rate": 6.025513573661512e-07, + "logits/chosen": 3.73828125, + "logits/rejected": 3.65234375, + "logps/chosen": -952.0, + "logps/rejected": -634.0, + "loss": 0.4347, + 
"rewards/accuracies": 0.84375, + "rewards/chosen": 1.85595703125, + "rewards/margins": 6.5859375, + "rewards/rejected": -4.73828125, + "step": 2604 + }, + { + "epoch": 0.5168907187856541, + "grad_norm": 34.229387159605885, + "learning_rate": 6.022418113563535e-07, + "logits/chosen": 3.62109375, + "logits/rejected": 3.671875, + "logps/chosen": -647.5, + "logps/rejected": -1286.0, + "loss": 0.5229, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.3955078125, + "rewards/margins": 6.7421875, + "rewards/rejected": -5.34765625, + "step": 2605 + }, + { + "epoch": 0.5170891413264547, + "grad_norm": 33.361123633174614, + "learning_rate": 6.019322402870873e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.83203125, + "logps/chosen": -941.0, + "logps/rejected": -729.0, + "loss": 0.4263, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.548828125, + "rewards/margins": 6.80078125, + "rewards/rejected": -4.244140625, + "step": 2606 + }, + { + "epoch": 0.5172875638672553, + "grad_norm": 26.948632058824046, + "learning_rate": 6.016226443068474e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 3.8828125, + "logps/chosen": -1384.0, + "logps/rejected": -842.0, + "loss": 0.3084, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4072265625, + "rewards/margins": 9.3515625, + "rewards/rejected": -6.9375, + "step": 2607 + }, + { + "epoch": 0.517485986408056, + "grad_norm": 31.772743696842475, + "learning_rate": 6.013130235641422e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 4.0234375, + "logps/chosen": -1370.0, + "logps/rejected": -867.0, + "loss": 0.2613, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4609375, + "rewards/margins": 10.921875, + "rewards/rejected": -7.4765625, + "step": 2608 + }, + { + "epoch": 0.5176844089488566, + "grad_norm": 31.21768681171158, + "learning_rate": 6.010033782074911e-07, + "logits/chosen": 3.8359375, + "logits/rejected": 3.88671875, + "logps/chosen": -991.0, + "logps/rejected": -1094.0, + "loss": 0.3903, + 
"rewards/accuracies": 0.84375, + "rewards/chosen": 2.56640625, + "rewards/margins": 10.625, + "rewards/rejected": -8.0546875, + "step": 2609 + }, + { + "epoch": 0.5178828314896572, + "grad_norm": 32.37538423166019, + "learning_rate": 6.006937083854256e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 3.5703125, + "logps/chosen": -784.0, + "logps/rejected": -573.0, + "loss": 0.453, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.013671875, + "rewards/margins": 5.6328125, + "rewards/rejected": -3.62890625, + "step": 2610 + }, + { + "epoch": 0.5180812540304579, + "grad_norm": 31.45068088601554, + "learning_rate": 6.003840142464885e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.4609375, + "logps/chosen": -1194.0, + "logps/rejected": -1057.0, + "loss": 0.4044, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.623046875, + "rewards/margins": 7.4765625, + "rewards/rejected": -5.8515625, + "step": 2611 + }, + { + "epoch": 0.5182796765712585, + "grad_norm": 33.927920382570655, + "learning_rate": 6.000742959392348e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 4.140625, + "logps/chosen": -969.0, + "logps/rejected": -1440.0, + "loss": 0.4672, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.67724609375, + "rewards/margins": 7.4921875, + "rewards/rejected": -5.81640625, + "step": 2612 + }, + { + "epoch": 0.5184780991120591, + "grad_norm": 34.23575408079641, + "learning_rate": 5.997645536122308e-07, + "logits/chosen": 3.46484375, + "logits/rejected": 3.80859375, + "logps/chosen": -801.0, + "logps/rejected": -874.0, + "loss": 0.4029, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.455078125, + "rewards/margins": 7.953125, + "rewards/rejected": -5.50390625, + "step": 2613 + }, + { + "epoch": 0.5186765216528597, + "grad_norm": 29.709347559686652, + "learning_rate": 5.994547874140545e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.21875, + "logps/chosen": -779.0, + "logps/rejected": -900.5, + "loss": 0.4513, + 
"rewards/accuracies": 0.8125, + "rewards/chosen": 2.580078125, + "rewards/margins": 11.5, + "rewards/rejected": -8.921875, + "step": 2614 + }, + { + "epoch": 0.5188749441936604, + "grad_norm": 31.823232681404672, + "learning_rate": 5.991449974932951e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.76953125, + "logps/chosen": -1142.0, + "logps/rejected": -792.5, + "loss": 0.4814, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7763671875, + "rewards/margins": 8.16015625, + "rewards/rejected": -5.37109375, + "step": 2615 + }, + { + "epoch": 0.519073366734461, + "grad_norm": 33.98501957299508, + "learning_rate": 5.988351839985535e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 4.03515625, + "logps/chosen": -1041.0, + "logps/rejected": -1246.0, + "loss": 0.401, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.19140625, + "rewards/margins": 9.8046875, + "rewards/rejected": -7.59765625, + "step": 2616 + }, + { + "epoch": 0.5192717892752616, + "grad_norm": 31.094183577712375, + "learning_rate": 5.985253470784414e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.3671875, + "logps/chosen": -1022.0, + "logps/rejected": -715.0, + "loss": 0.393, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.298828125, + "rewards/margins": 11.7421875, + "rewards/rejected": -9.453125, + "step": 2617 + }, + { + "epoch": 0.5194702118160623, + "grad_norm": 33.51208368973596, + "learning_rate": 5.982154868815823e-07, + "logits/chosen": 4.078125, + "logits/rejected": 3.66015625, + "logps/chosen": -1056.0, + "logps/rejected": -567.0, + "loss": 0.4481, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.08984375, + "rewards/margins": 7.42578125, + "rewards/rejected": -5.33984375, + "step": 2618 + }, + { + "epoch": 0.519668634356863, + "grad_norm": 26.454368472911757, + "learning_rate": 5.979056035566107e-07, + "logits/chosen": 4.39453125, + "logits/rejected": 4.109375, + "logps/chosen": -1239.0, + "logps/rejected": -1051.0, + "loss": 0.3653, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 2.828125, + "rewards/margins": 9.78125, + "rewards/rejected": -6.9375, + "step": 2619 + }, + { + "epoch": 0.5198670568976635, + "grad_norm": 38.273700644317614, + "learning_rate": 5.97595697252172e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.48046875, + "logps/chosen": -1396.0, + "logps/rejected": -826.0, + "loss": 0.2855, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.564453125, + "rewards/margins": 8.5703125, + "rewards/rejected": -6.0, + "step": 2620 + }, + { + "epoch": 0.5200654794384643, + "grad_norm": 42.91362942908096, + "learning_rate": 5.972857681169229e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.2421875, + "logps/chosen": -963.5, + "logps/rejected": -855.5, + "loss": 0.5632, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.515625, + "rewards/margins": 6.7578125, + "rewards/rejected": -5.234375, + "step": 2621 + }, + { + "epoch": 0.5202639019792649, + "grad_norm": 19.51804725082283, + "learning_rate": 5.969758162995307e-07, + "logits/chosen": 4.59375, + "logits/rejected": 4.33984375, + "logps/chosen": -1203.0, + "logps/rejected": -783.0, + "loss": 0.3322, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8759765625, + "rewards/margins": 8.78125, + "rewards/rejected": -5.90625, + "step": 2622 + }, + { + "epoch": 0.5204623245200655, + "grad_norm": 37.363321278219225, + "learning_rate": 5.966658419486737e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 4.03515625, + "logps/chosen": -994.5, + "logps/rejected": -633.5, + "loss": 0.5214, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.20703125, + "rewards/margins": 5.8984375, + "rewards/rejected": -3.6796875, + "step": 2623 + }, + { + "epoch": 0.5206607470608661, + "grad_norm": 32.753919669801306, + "learning_rate": 5.963558452130415e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.859375, + "logps/chosen": -774.0, + "logps/rejected": -856.0, + "loss": 0.431, + "rewards/accuracies": 0.875, + 
"rewards/chosen": 1.6494140625, + "rewards/margins": 14.796875, + "rewards/rejected": -13.08984375, + "step": 2624 + }, + { + "epoch": 0.5208591696016668, + "grad_norm": 32.00230986983913, + "learning_rate": 5.960458262413336e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.875, + "logps/chosen": -832.0, + "logps/rejected": -773.0, + "loss": 0.3465, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.25, + "rewards/margins": 7.234375, + "rewards/rejected": -4.97265625, + "step": 2625 + }, + { + "epoch": 0.5210575921424674, + "grad_norm": 29.743695654001677, + "learning_rate": 5.957357851822608e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 4.0, + "logps/chosen": -617.0, + "logps/rejected": -642.0, + "loss": 0.6368, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.8916015625, + "rewards/margins": 5.49609375, + "rewards/rejected": -3.623046875, + "step": 2626 + }, + { + "epoch": 0.521256014683268, + "grad_norm": 35.46608710044223, + "learning_rate": 5.954257221845443e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.7265625, + "logps/chosen": -934.0, + "logps/rejected": -899.5, + "loss": 0.4714, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.9033203125, + "rewards/margins": 6.76953125, + "rewards/rejected": -4.8671875, + "step": 2627 + }, + { + "epoch": 0.5214544372240687, + "grad_norm": 23.79590413222533, + "learning_rate": 5.951156373969158e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.1640625, + "logps/chosen": -1320.0, + "logps/rejected": -861.0, + "loss": 0.3514, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.376953125, + "rewards/margins": 7.375, + "rewards/rejected": -4.98828125, + "step": 2628 + }, + { + "epoch": 0.5216528597648693, + "grad_norm": 27.322332701086715, + "learning_rate": 5.948055309681175e-07, + "logits/chosen": 3.84375, + "logits/rejected": 3.95703125, + "logps/chosen": -1200.0, + "logps/rejected": -656.5, + "loss": 0.3252, + "rewards/accuracies": 0.84375, + "rewards/chosen": 
2.97265625, + "rewards/margins": 8.1875, + "rewards/rejected": -5.1953125, + "step": 2629 + }, + { + "epoch": 0.5218512823056699, + "grad_norm": 29.756575245662617, + "learning_rate": 5.944954030469018e-07, + "logits/chosen": 3.76953125, + "logits/rejected": 3.87890625, + "logps/chosen": -974.0, + "logps/rejected": -785.0, + "loss": 0.3756, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.49853515625, + "rewards/margins": 9.2578125, + "rewards/rejected": -6.7421875, + "step": 2630 + }, + { + "epoch": 0.5220497048464705, + "grad_norm": 32.037878728128675, + "learning_rate": 5.941852537820318e-07, + "logits/chosen": 3.84375, + "logits/rejected": 3.60546875, + "logps/chosen": -903.0, + "logps/rejected": -542.5, + "loss": 0.3551, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.3232421875, + "rewards/margins": 7.125, + "rewards/rejected": -5.80078125, + "step": 2631 + }, + { + "epoch": 0.5222481273872712, + "grad_norm": 30.557606459563495, + "learning_rate": 5.938750833222803e-07, + "logits/chosen": 4.32421875, + "logits/rejected": 4.01171875, + "logps/chosen": -734.0, + "logps/rejected": -430.0, + "loss": 0.4407, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.212890625, + "rewards/margins": 6.2421875, + "rewards/rejected": -4.03515625, + "step": 2632 + }, + { + "epoch": 0.5224465499280718, + "grad_norm": 32.43230687087241, + "learning_rate": 5.935648918164306e-07, + "logits/chosen": 4.53515625, + "logits/rejected": 4.484375, + "logps/chosen": -771.5, + "logps/rejected": -622.0, + "loss": 0.3237, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.8101806640625, + "rewards/margins": 8.90625, + "rewards/rejected": -7.109375, + "step": 2633 + }, + { + "epoch": 0.5226449724688724, + "grad_norm": 38.89208098849388, + "learning_rate": 5.932546794132762e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.15234375, + "logps/chosen": -1078.0, + "logps/rejected": -779.0, + "loss": 0.5751, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.615234375, + 
"rewards/margins": 4.5693359375, + "rewards/rejected": -2.96923828125, + "step": 2634 + }, + { + "epoch": 0.5228433950096731, + "grad_norm": 32.000151554397895, + "learning_rate": 5.929444462616206e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.73828125, + "logps/chosen": -1082.0, + "logps/rejected": -705.75, + "loss": 0.427, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.330078125, + "rewards/margins": 6.57421875, + "rewards/rejected": -4.2421875, + "step": 2635 + }, + { + "epoch": 0.5230418175504737, + "grad_norm": 32.44322702843421, + "learning_rate": 5.926341925102768e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.171875, + "logps/chosen": -1075.0, + "logps/rejected": -890.5, + "loss": 0.4808, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.29296875, + "rewards/margins": 6.8359375, + "rewards/rejected": -5.5390625, + "step": 2636 + }, + { + "epoch": 0.5232402400912743, + "grad_norm": 29.177055585288283, + "learning_rate": 5.923239183080683e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.75390625, + "logps/chosen": -1133.0, + "logps/rejected": -679.5, + "loss": 0.2663, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.34765625, + "rewards/margins": 9.390625, + "rewards/rejected": -7.0546875, + "step": 2637 + }, + { + "epoch": 0.5234386626320751, + "grad_norm": 34.14198061827039, + "learning_rate": 5.920136238038277e-07, + "logits/chosen": 3.31640625, + "logits/rejected": 2.9609375, + "logps/chosen": -1072.0, + "logps/rejected": -667.5, + "loss": 0.3939, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.447265625, + "rewards/margins": 7.0625, + "rewards/rejected": -4.609375, + "step": 2638 + }, + { + "epoch": 0.5236370851728757, + "grad_norm": 31.825421192760977, + "learning_rate": 5.917033091463984e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.38671875, + "logps/chosen": -958.0, + "logps/rejected": -584.5, + "loss": 0.5572, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.068359375, + 
"rewards/margins": 6.34375, + "rewards/rejected": -4.265625, + "step": 2639 + }, + { + "epoch": 0.5238355077136763, + "grad_norm": 31.238678030632013, + "learning_rate": 5.913929744846323e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.6171875, + "logps/chosen": -1095.0, + "logps/rejected": -691.0, + "loss": 0.369, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.13671875, + "rewards/margins": 7.8046875, + "rewards/rejected": -4.66796875, + "step": 2640 + }, + { + "epoch": 0.5240339302544769, + "grad_norm": 32.267056138910576, + "learning_rate": 5.910826199673918e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 3.953125, + "logps/chosen": -1064.0, + "logps/rejected": -687.0, + "loss": 0.3908, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4921875, + "rewards/margins": 7.80078125, + "rewards/rejected": -5.30078125, + "step": 2641 + }, + { + "epoch": 0.5242323527952776, + "grad_norm": 26.407455899427536, + "learning_rate": 5.907722457435481e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.00390625, + "logps/chosen": -1168.5, + "logps/rejected": -731.5, + "loss": 0.2924, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.158203125, + "rewards/margins": 9.4296875, + "rewards/rejected": -6.2578125, + "step": 2642 + }, + { + "epoch": 0.5244307753360782, + "grad_norm": 27.621980779789098, + "learning_rate": 5.904618519619824e-07, + "logits/chosen": 3.921875, + "logits/rejected": 4.125, + "logps/chosen": -1120.0, + "logps/rejected": -738.0, + "loss": 0.4818, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.166015625, + "rewards/margins": 6.1875, + "rewards/rejected": -4.02734375, + "step": 2643 + }, + { + "epoch": 0.5246291978768788, + "grad_norm": 31.66392754733828, + "learning_rate": 5.90151438771585e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.625, + "logps/chosen": -1053.0, + "logps/rejected": -1094.0, + "loss": 0.2683, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6953125, + "rewards/margins": 9.46875, + 
"rewards/rejected": -6.765625, + "step": 2644 + }, + { + "epoch": 0.5248276204176795, + "grad_norm": 43.83851137167919, + "learning_rate": 5.898410063212558e-07, + "logits/chosen": 4.4453125, + "logits/rejected": 4.49609375, + "logps/chosen": -1073.0, + "logps/rejected": -1230.0, + "loss": 0.4522, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8681640625, + "rewards/margins": 8.734375, + "rewards/rejected": -6.8671875, + "step": 2645 + }, + { + "epoch": 0.5250260429584801, + "grad_norm": 32.643966502655026, + "learning_rate": 5.895305547599034e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 3.87890625, + "logps/chosen": -1135.0, + "logps/rejected": -661.5, + "loss": 0.3885, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.703125, + "rewards/margins": 7.2734375, + "rewards/rejected": -4.578125, + "step": 2646 + }, + { + "epoch": 0.5252244654992807, + "grad_norm": 41.76761274898696, + "learning_rate": 5.892200842364462e-07, + "logits/chosen": 4.015625, + "logits/rejected": 3.80859375, + "logps/chosen": -894.0, + "logps/rejected": -662.0, + "loss": 0.4895, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.3212890625, + "rewards/margins": 5.46875, + "rewards/rejected": -4.1484375, + "step": 2647 + }, + { + "epoch": 0.5254228880400813, + "grad_norm": 40.231431150391316, + "learning_rate": 5.889095948998112e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.52734375, + "logps/chosen": -1338.5, + "logps/rejected": -735.0, + "loss": 0.4248, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3896484375, + "rewards/margins": 7.515625, + "rewards/rejected": -5.125, + "step": 2648 + }, + { + "epoch": 0.525621310580882, + "grad_norm": 30.097587319765807, + "learning_rate": 5.885990868989347e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 4.8359375, + "logps/chosen": -908.0, + "logps/rejected": -727.0, + "loss": 0.5293, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.068359375, + "rewards/margins": 5.54296875, + "rewards/rejected": 
-3.47265625, + "step": 2649 + }, + { + "epoch": 0.5258197331216826, + "grad_norm": 39.03023693964033, + "learning_rate": 5.882885603827619e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.421875, + "logps/chosen": -1255.0, + "logps/rejected": -1367.0, + "loss": 0.3417, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.21875, + "rewards/margins": 13.2734375, + "rewards/rejected": -10.0625, + "step": 2650 + }, + { + "epoch": 0.5260181556624832, + "grad_norm": 32.58904177157744, + "learning_rate": 5.879780155002468e-07, + "logits/chosen": 4.37109375, + "logits/rejected": 4.09765625, + "logps/chosen": -743.0, + "logps/rejected": -615.0, + "loss": 0.4814, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.900390625, + "rewards/margins": 6.23828125, + "rewards/rejected": -4.3359375, + "step": 2651 + }, + { + "epoch": 0.5262165782032839, + "grad_norm": 29.916045694898127, + "learning_rate": 5.876674524003522e-07, + "logits/chosen": 4.5234375, + "logits/rejected": 4.4609375, + "logps/chosen": -1043.0, + "logps/rejected": -1659.5, + "loss": 0.2644, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.29296875, + "rewards/margins": 13.3984375, + "rewards/rejected": -10.12890625, + "step": 2652 + }, + { + "epoch": 0.5264150007440845, + "grad_norm": 37.43334976885655, + "learning_rate": 5.873568712320498e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 4.0625, + "logps/chosen": -781.0, + "logps/rejected": -657.5, + "loss": 0.5686, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.435546875, + "rewards/margins": 6.9140625, + "rewards/rejected": -5.484375, + "step": 2653 + }, + { + "epoch": 0.5266134232848851, + "grad_norm": 28.625217624981413, + "learning_rate": 5.8704627214432e-07, + "logits/chosen": 3.63671875, + "logits/rejected": 3.5, + "logps/chosen": -1122.0, + "logps/rejected": -699.5, + "loss": 0.3305, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.095703125, + "rewards/margins": 7.2421875, + "rewards/rejected": -5.14453125, + "step": 
2654 + }, + { + "epoch": 0.5268118458256857, + "grad_norm": 34.340802859558856, + "learning_rate": 5.867356552861514e-07, + "logits/chosen": 3.5546875, + "logits/rejected": 3.8828125, + "logps/chosen": -1027.0, + "logps/rejected": -780.0, + "loss": 0.3786, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.560546875, + "rewards/margins": 8.04296875, + "rewards/rejected": -5.4912109375, + "step": 2655 + }, + { + "epoch": 0.5270102683664865, + "grad_norm": 30.65027159828589, + "learning_rate": 5.864250208065415e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.3359375, + "logps/chosen": -988.0, + "logps/rejected": -1267.0, + "loss": 0.3103, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.91650390625, + "rewards/margins": 10.1171875, + "rewards/rejected": -8.21484375, + "step": 2656 + }, + { + "epoch": 0.5272086909072871, + "grad_norm": 33.522586053207135, + "learning_rate": 5.861143688544962e-07, + "logits/chosen": 3.87109375, + "logits/rejected": 3.8359375, + "logps/chosen": -1030.0, + "logps/rejected": -762.5, + "loss": 0.3076, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.34765625, + "rewards/margins": 13.8515625, + "rewards/rejected": -11.515625, + "step": 2657 + }, + { + "epoch": 0.5274071134480877, + "grad_norm": 33.57993482570813, + "learning_rate": 5.858036995790296e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.3515625, + "logps/chosen": -1599.0, + "logps/rejected": -961.0, + "loss": 0.3238, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.26171875, + "rewards/margins": 8.78125, + "rewards/rejected": -6.5078125, + "step": 2658 + }, + { + "epoch": 0.5276055359888884, + "grad_norm": 27.258445664880167, + "learning_rate": 5.854930131291642e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.0859375, + "logps/chosen": -1210.0, + "logps/rejected": -779.5, + "loss": 0.3236, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.576171875, + "rewards/margins": 6.9765625, + "rewards/rejected": -4.3984375, + "step": 
2659 + }, + { + "epoch": 0.527803958529689, + "grad_norm": 34.724662317591665, + "learning_rate": 5.85182309653931e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.2421875, + "logps/chosen": -1030.0, + "logps/rejected": -1197.0, + "loss": 0.4358, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.138671875, + "rewards/margins": 8.1640625, + "rewards/rejected": -6.0234375, + "step": 2660 + }, + { + "epoch": 0.5280023810704896, + "grad_norm": 29.122960113936255, + "learning_rate": 5.848715893023689e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.24609375, + "logps/chosen": -869.0, + "logps/rejected": -747.0, + "loss": 0.3505, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.615234375, + "rewards/margins": 7.28125, + "rewards/rejected": -4.65234375, + "step": 2661 + }, + { + "epoch": 0.5282008036112903, + "grad_norm": 33.25665279750576, + "learning_rate": 5.845608522235248e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.0078125, + "logps/chosen": -1211.0, + "logps/rejected": -843.0, + "loss": 0.3003, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8671875, + "rewards/margins": 8.859375, + "rewards/rejected": -5.9921875, + "step": 2662 + }, + { + "epoch": 0.5283992261520909, + "grad_norm": 34.40683609574913, + "learning_rate": 5.842500985664538e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.390625, + "logps/chosen": -909.0, + "logps/rejected": -1628.0, + "loss": 0.4071, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.328125, + "rewards/margins": 9.3828125, + "rewards/rejected": -7.0390625, + "step": 2663 + }, + { + "epoch": 0.5285976486928915, + "grad_norm": 28.459882315974674, + "learning_rate": 5.839393284802188e-07, + "logits/chosen": 4.22265625, + "logits/rejected": 4.40625, + "logps/chosen": -1127.0, + "logps/rejected": -1784.0, + "loss": 0.4914, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.24609375, + "rewards/margins": 8.81640625, + "rewards/rejected": -6.55859375, + "step": 2664 + }, + { + 
"epoch": 0.5287960712336921, + "grad_norm": 25.70185942277183, + "learning_rate": 5.836285421138909e-07, + "logits/chosen": 3.55859375, + "logits/rejected": 3.61328125, + "logps/chosen": -691.0, + "logps/rejected": -524.5, + "loss": 0.3377, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.73828125, + "rewards/margins": 6.5546875, + "rewards/rejected": -4.82421875, + "step": 2665 + }, + { + "epoch": 0.5289944937744928, + "grad_norm": 32.861211278021365, + "learning_rate": 5.833177396165486e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.27734375, + "logps/chosen": -932.0, + "logps/rejected": -676.0, + "loss": 0.3972, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.09375, + "rewards/margins": 9.640625, + "rewards/rejected": -7.5390625, + "step": 2666 + }, + { + "epoch": 0.5291929163152934, + "grad_norm": 39.41151785724765, + "learning_rate": 5.830069211372787e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.40625, + "logps/chosen": -811.0, + "logps/rejected": -640.0, + "loss": 0.417, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.876953125, + "rewards/margins": 6.8671875, + "rewards/rejected": -4.98828125, + "step": 2667 + }, + { + "epoch": 0.529391338856094, + "grad_norm": 46.823434611898605, + "learning_rate": 5.826960868251748e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 4.30078125, + "logps/chosen": -1092.0, + "logps/rejected": -562.5, + "loss": 0.4367, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8759765625, + "rewards/margins": 6.578125, + "rewards/rejected": -4.69140625, + "step": 2668 + }, + { + "epoch": 0.5295897613968947, + "grad_norm": 25.296494576433115, + "learning_rate": 5.823852368293388e-07, + "logits/chosen": 3.7421875, + "logits/rejected": 3.75, + "logps/chosen": -722.0, + "logps/rejected": -614.0, + "loss": 0.3908, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.33203125, + "rewards/margins": 6.40625, + "rewards/rejected": -4.08203125, + "step": 2669 + }, + { + "epoch": 
0.5297881839376953, + "grad_norm": 37.406304633743304, + "learning_rate": 5.820743712988801e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.00390625, + "logps/chosen": -1269.0, + "logps/rejected": -710.0, + "loss": 0.3744, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.232421875, + "rewards/margins": 8.1328125, + "rewards/rejected": -4.90234375, + "step": 2670 + }, + { + "epoch": 0.5299866064784959, + "grad_norm": 29.73594192443669, + "learning_rate": 5.817634903829153e-07, + "logits/chosen": 3.953125, + "logits/rejected": 4.234375, + "logps/chosen": -1330.0, + "logps/rejected": -1282.0, + "loss": 0.3871, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.451171875, + "rewards/margins": 6.8359375, + "rewards/rejected": -6.3984375, + "step": 2671 + }, + { + "epoch": 0.5301850290192965, + "grad_norm": 36.71627259052248, + "learning_rate": 5.814525942305682e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.109375, + "logps/chosen": -1037.0, + "logps/rejected": -690.0, + "loss": 0.3692, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.658203125, + "rewards/margins": 8.203125, + "rewards/rejected": -5.54296875, + "step": 2672 + }, + { + "epoch": 0.5303834515600973, + "grad_norm": 36.33036543213691, + "learning_rate": 5.811416829909705e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.25, + "logps/chosen": -819.0, + "logps/rejected": -581.5, + "loss": 0.5719, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.45703125, + "rewards/margins": 5.30078125, + "rewards/rejected": -3.841796875, + "step": 2673 + }, + { + "epoch": 0.5305818741008979, + "grad_norm": 33.058315147457456, + "learning_rate": 5.808307568132605e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.25, + "logps/chosen": -1403.0, + "logps/rejected": -1536.0, + "loss": 0.3751, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.36328125, + "rewards/margins": 10.421875, + "rewards/rejected": -7.0390625, + "step": 2674 + }, + { + "epoch": 0.5307802966416985, 
+ "grad_norm": 33.012106289624505, + "learning_rate": 5.805198158465842e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 3.8359375, + "logps/chosen": -875.0, + "logps/rejected": -695.0, + "loss": 0.3889, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.20947265625, + "rewards/margins": 7.3671875, + "rewards/rejected": -5.1484375, + "step": 2675 + }, + { + "epoch": 0.5309787191824992, + "grad_norm": 48.45363040934529, + "learning_rate": 5.802088602400944e-07, + "logits/chosen": 3.921875, + "logits/rejected": 4.0078125, + "logps/chosen": -1056.0, + "logps/rejected": -1351.0, + "loss": 0.3684, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.3828125, + "rewards/margins": 9.1640625, + "rewards/rejected": -6.79296875, + "step": 2676 + }, + { + "epoch": 0.5311771417232998, + "grad_norm": 27.487196362056, + "learning_rate": 5.79897890142951e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 4.16796875, + "logps/chosen": -1354.0, + "logps/rejected": -1233.0, + "loss": 0.313, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.41015625, + "rewards/margins": 10.7421875, + "rewards/rejected": -7.3359375, + "step": 2677 + }, + { + "epoch": 0.5313755642641004, + "grad_norm": 35.72022669717008, + "learning_rate": 5.795869057043206e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.41015625, + "logps/chosen": -681.0, + "logps/rejected": -731.0, + "loss": 0.5428, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6728515625, + "rewards/margins": 5.2421875, + "rewards/rejected": -4.5703125, + "step": 2678 + }, + { + "epoch": 0.5315739868049011, + "grad_norm": 32.58153730678406, + "learning_rate": 5.792759070733772e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.953125, + "logps/chosen": -981.0, + "logps/rejected": -619.0, + "loss": 0.5072, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.07763671875, + "rewards/margins": 6.63671875, + "rewards/rejected": -4.5625, + "step": 2679 + }, + { + "epoch": 0.5317724093457017, + 
"grad_norm": 31.12856691289001, + "learning_rate": 5.789648943993012e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.01171875, + "logps/chosen": -1064.0, + "logps/rejected": -663.0, + "loss": 0.4417, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.201171875, + "rewards/margins": 6.8203125, + "rewards/rejected": -4.6171875, + "step": 2680 + }, + { + "epoch": 0.5319708318865023, + "grad_norm": 44.22250783811482, + "learning_rate": 5.786538678312799e-07, + "logits/chosen": 3.66015625, + "logits/rejected": 4.0390625, + "logps/chosen": -1000.0, + "logps/rejected": -859.0, + "loss": 0.4898, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.30078125, + "rewards/margins": 7.4453125, + "rewards/rejected": -5.1484375, + "step": 2681 + }, + { + "epoch": 0.5321692544273029, + "grad_norm": 30.7078672078003, + "learning_rate": 5.783428275185072e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.44140625, + "logps/chosen": -1078.0, + "logps/rejected": -781.0, + "loss": 0.4256, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.921875, + "rewards/margins": 7.37109375, + "rewards/rejected": -5.4609375, + "step": 2682 + }, + { + "epoch": 0.5323676769681036, + "grad_norm": 35.7428994242555, + "learning_rate": 5.780317736101836e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 3.71484375, + "logps/chosen": -835.0, + "logps/rejected": -569.5, + "loss": 0.4798, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.13671875, + "rewards/margins": 5.1796875, + "rewards/rejected": -3.041015625, + "step": 2683 + }, + { + "epoch": 0.5325660995089042, + "grad_norm": 34.006085318052605, + "learning_rate": 5.77720706255516e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 3.78125, + "logps/chosen": -992.0, + "logps/rejected": -772.5, + "loss": 0.2937, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.546875, + "rewards/margins": 8.2890625, + "rewards/rejected": -5.75390625, + "step": 2684 + }, + { + "epoch": 0.5327645220497048, + "grad_norm": 
35.74470324942312, + "learning_rate": 5.77409625603718e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 3.62109375, + "logps/chosen": -916.0, + "logps/rejected": -560.0, + "loss": 0.4253, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.427734375, + "rewards/margins": 6.0234375, + "rewards/rejected": -3.59765625, + "step": 2685 + }, + { + "epoch": 0.5329629445905055, + "grad_norm": 33.564998469987074, + "learning_rate": 5.770985318040097e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 3.9453125, + "logps/chosen": -947.5, + "logps/rejected": -844.0, + "loss": 0.4251, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3994140625, + "rewards/margins": 17.42578125, + "rewards/rejected": -15.02734375, + "step": 2686 + }, + { + "epoch": 0.5331613671313061, + "grad_norm": 32.569992281474285, + "learning_rate": 5.767874250056169e-07, + "logits/chosen": 3.40234375, + "logits/rejected": 3.27734375, + "logps/chosen": -786.0, + "logps/rejected": -597.5, + "loss": 0.4282, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.7548828125, + "rewards/margins": 7.1484375, + "rewards/rejected": -5.40234375, + "step": 2687 + }, + { + "epoch": 0.5333597896721067, + "grad_norm": 39.904155418823, + "learning_rate": 5.764763053577722e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.875, + "logps/chosen": -969.0, + "logps/rejected": -1529.0, + "loss": 0.3734, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.548828125, + "rewards/margins": 9.6015625, + "rewards/rejected": -7.05859375, + "step": 2688 + }, + { + "epoch": 0.5335582122129073, + "grad_norm": 30.74537904873568, + "learning_rate": 5.761651730097142e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 3.80859375, + "logps/chosen": -886.0, + "logps/rejected": -771.0, + "loss": 0.5134, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.404296875, + "rewards/margins": 6.34375, + "rewards/rejected": -3.93359375, + "step": 2689 + }, + { + "epoch": 0.533756634753708, + "grad_norm": 
49.66585556973121, + "learning_rate": 5.758540281106873e-07, + "logits/chosen": 4.33984375, + "logits/rejected": 4.27734375, + "logps/chosen": -1275.0, + "logps/rejected": -980.0, + "loss": 0.467, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.203125, + "rewards/margins": 7.703125, + "rewards/rejected": -5.5078125, + "step": 2690 + }, + { + "epoch": 0.5339550572945087, + "grad_norm": 36.706126721673904, + "learning_rate": 5.755428708099424e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 3.94140625, + "logps/chosen": -1296.0, + "logps/rejected": -1191.0, + "loss": 0.4039, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.30078125, + "rewards/margins": 19.828125, + "rewards/rejected": -17.5078125, + "step": 2691 + }, + { + "epoch": 0.5341534798353093, + "grad_norm": 27.014607876798333, + "learning_rate": 5.752317012567362e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 4.11328125, + "logps/chosen": -917.0, + "logps/rejected": -623.5, + "loss": 0.4052, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.064453125, + "rewards/margins": 6.34375, + "rewards/rejected": -4.2890625, + "step": 2692 + }, + { + "epoch": 0.53435190237611, + "grad_norm": 37.7565982595681, + "learning_rate": 5.749205196003313e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.91796875, + "logps/chosen": -1264.0, + "logps/rejected": -943.0, + "loss": 0.3649, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.01953125, + "rewards/margins": 8.859375, + "rewards/rejected": -5.828125, + "step": 2693 + }, + { + "epoch": 0.5345503249169106, + "grad_norm": 31.227283434939764, + "learning_rate": 5.746093259899954e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.046875, + "logps/chosen": -1102.0, + "logps/rejected": -709.0, + "loss": 0.4732, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.6015625, + "rewards/margins": 7.2109375, + "rewards/rejected": -5.62109375, + "step": 2694 + }, + { + "epoch": 0.5347487474577112, + "grad_norm": 
47.25590809270375, + "learning_rate": 5.742981205750032e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.28515625, + "logps/chosen": -1218.0, + "logps/rejected": -859.5, + "loss": 0.4694, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.34375, + "rewards/margins": 8.1875, + "rewards/rejected": -5.859375, + "step": 2695 + }, + { + "epoch": 0.5349471699985119, + "grad_norm": 32.53240569993013, + "learning_rate": 5.739869035046343e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 3.86328125, + "logps/chosen": -1070.0, + "logps/rejected": -598.0, + "loss": 0.4596, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.841796875, + "rewards/margins": 7.0859375, + "rewards/rejected": -4.2421875, + "step": 2696 + }, + { + "epoch": 0.5351455925393125, + "grad_norm": 37.76553012006142, + "learning_rate": 5.736756749281737e-07, + "logits/chosen": 3.6328125, + "logits/rejected": 3.67578125, + "logps/chosen": -1624.0, + "logps/rejected": -561.0, + "loss": 0.5807, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.89453125, + "rewards/margins": 1.12109375, + "rewards/rejected": -2.9921875, + "step": 2697 + }, + { + "epoch": 0.5353440150801131, + "grad_norm": 38.63540833357952, + "learning_rate": 5.733644349949123e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.0, + "logps/chosen": -882.0, + "logps/rejected": -631.0, + "loss": 0.4704, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.814453125, + "rewards/margins": 5.6328125, + "rewards/rejected": -3.82421875, + "step": 2698 + }, + { + "epoch": 0.5355424376209137, + "grad_norm": 39.040889130378595, + "learning_rate": 5.730531838541467e-07, + "logits/chosen": 3.73046875, + "logits/rejected": 3.7265625, + "logps/chosen": -1306.0, + "logps/rejected": -883.0, + "loss": 0.446, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.119140625, + "rewards/margins": 5.890625, + "rewards/rejected": -3.7734375, + "step": 2699 + }, + { + "epoch": 0.5357408601617144, + "grad_norm": 
40.051094245506356, + "learning_rate": 5.727419216551781e-07, + "logits/chosen": 3.74609375, + "logits/rejected": 3.92578125, + "logps/chosen": -920.5, + "logps/rejected": -700.5, + "loss": 0.4609, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.408203125, + "rewards/margins": 6.1796875, + "rewards/rejected": -3.78125, + "step": 2700 + }, + { + "epoch": 0.535939282702515, + "grad_norm": 48.19422322229894, + "learning_rate": 5.724306485473137e-07, + "logits/chosen": 4.5234375, + "logits/rejected": 4.0859375, + "logps/chosen": -1126.0, + "logps/rejected": -922.0, + "loss": 0.2843, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.904296875, + "rewards/margins": 7.1015625, + "rewards/rejected": -4.1953125, + "step": 2701 + }, + { + "epoch": 0.5361377052433156, + "grad_norm": 30.115397505807746, + "learning_rate": 5.721193646798657e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 3.78125, + "logps/chosen": -837.0, + "logps/rejected": -858.0, + "loss": 0.3806, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.037109375, + "rewards/margins": 6.7109375, + "rewards/rejected": -4.671875, + "step": 2702 + }, + { + "epoch": 0.5363361277841163, + "grad_norm": 35.308525923659886, + "learning_rate": 5.718080702021514e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.7109375, + "logps/chosen": -831.0, + "logps/rejected": -615.5, + "loss": 0.3452, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.302734375, + "rewards/margins": 7.921875, + "rewards/rejected": -5.62890625, + "step": 2703 + }, + { + "epoch": 0.5365345503249169, + "grad_norm": 34.459859134610205, + "learning_rate": 5.714967652634931e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 4.078125, + "logps/chosen": -882.5, + "logps/rejected": -572.0, + "loss": 0.4066, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.25, + "rewards/margins": 6.55859375, + "rewards/rejected": -4.2958984375, + "step": 2704 + }, + { + "epoch": 0.5367329728657175, + "grad_norm": 
30.897727367062707, + "learning_rate": 5.711854500132184e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.15234375, + "logps/chosen": -1000.5, + "logps/rejected": -689.0, + "loss": 0.4245, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4716796875, + "rewards/margins": 7.67578125, + "rewards/rejected": -5.21875, + "step": 2705 + }, + { + "epoch": 0.5369313954065181, + "grad_norm": 38.501618706125775, + "learning_rate": 5.708741246006596e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 3.8359375, + "logps/chosen": -942.0, + "logps/rejected": -644.5, + "loss": 0.3749, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.392578125, + "rewards/margins": 7.7890625, + "rewards/rejected": -5.37890625, + "step": 2706 + }, + { + "epoch": 0.5371298179473188, + "grad_norm": 30.749271310017445, + "learning_rate": 5.705627891751541e-07, + "logits/chosen": 3.9375, + "logits/rejected": 4.0859375, + "logps/chosen": -877.5, + "logps/rejected": -1263.5, + "loss": 0.459, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.98046875, + "rewards/margins": 8.546875, + "rewards/rejected": -6.5703125, + "step": 2707 + }, + { + "epoch": 0.5373282404881194, + "grad_norm": 42.67176092148224, + "learning_rate": 5.702514438860438e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.9140625, + "logps/chosen": -1124.0, + "logps/rejected": -670.0, + "loss": 0.3553, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.9921875, + "rewards/margins": 8.6640625, + "rewards/rejected": -6.66796875, + "step": 2708 + }, + { + "epoch": 0.53752666302892, + "grad_norm": 35.17537561672979, + "learning_rate": 5.699400888826759e-07, + "logits/chosen": 3.80859375, + "logits/rejected": 3.8046875, + "logps/chosen": -1016.0, + "logps/rejected": -904.0, + "loss": 0.4533, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.130859375, + "rewards/margins": 8.3515625, + "rewards/rejected": -7.2265625, + "step": 2709 + }, + { + "epoch": 0.5377250855697208, + "grad_norm": 
35.06372948348037, + "learning_rate": 5.696287243144012e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.09375, + "logps/chosen": -1085.0, + "logps/rejected": -614.5, + "loss": 0.3901, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.015625, + "rewards/margins": 7.7265625, + "rewards/rejected": -5.72265625, + "step": 2710 + }, + { + "epoch": 0.5379235081105214, + "grad_norm": 32.92239945934764, + "learning_rate": 5.693173503305762e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 4.2734375, + "logps/chosen": -1429.0, + "logps/rejected": -764.0, + "loss": 0.5792, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.5087890625, + "rewards/margins": 5.71875, + "rewards/rejected": -4.21875, + "step": 2711 + }, + { + "epoch": 0.538121930651322, + "grad_norm": 28.62728623137661, + "learning_rate": 5.690059670805614e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.859375, + "logps/chosen": -1071.0, + "logps/rejected": -708.0, + "loss": 0.4305, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8056640625, + "rewards/margins": 7.0078125, + "rewards/rejected": -5.1953125, + "step": 2712 + }, + { + "epoch": 0.5383203531921227, + "grad_norm": 24.215452372744192, + "learning_rate": 5.686945747137221e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 4.00390625, + "logps/chosen": -1148.0, + "logps/rejected": -719.0, + "loss": 0.3251, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6171875, + "rewards/margins": 8.4375, + "rewards/rejected": -5.83203125, + "step": 2713 + }, + { + "epoch": 0.5385187757329233, + "grad_norm": 25.72908570503651, + "learning_rate": 5.68383173379427e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.30078125, + "logps/chosen": -879.0, + "logps/rejected": -635.0, + "loss": 0.4009, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.501953125, + "rewards/margins": 7.421875, + "rewards/rejected": -4.93359375, + "step": 2714 + }, + { + "epoch": 0.5387171982737239, + "grad_norm": 30.09920713087005, + 
"learning_rate": 5.680717632270502e-07, + "logits/chosen": 4.22265625, + "logits/rejected": 4.8046875, + "logps/chosen": -1156.0, + "logps/rejected": -713.0, + "loss": 0.3808, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.69921875, + "rewards/margins": 8.1015625, + "rewards/rejected": -5.3984375, + "step": 2715 + }, + { + "epoch": 0.5389156208145245, + "grad_norm": 37.76305357555748, + "learning_rate": 5.677603444059698e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.5, + "logps/chosen": -808.5, + "logps/rejected": -594.0, + "loss": 0.5685, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.79296875, + "rewards/margins": 5.0703125, + "rewards/rejected": -3.291015625, + "step": 2716 + }, + { + "epoch": 0.5391140433553252, + "grad_norm": 24.854055525860804, + "learning_rate": 5.674489170655675e-07, + "logits/chosen": 3.7109375, + "logits/rejected": 3.8203125, + "logps/chosen": -978.0, + "logps/rejected": -639.0, + "loss": 0.2938, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.3515625, + "rewards/margins": 7.8828125, + "rewards/rejected": -5.5078125, + "step": 2717 + }, + { + "epoch": 0.5393124658961258, + "grad_norm": 44.045885630823015, + "learning_rate": 5.671374813552298e-07, + "logits/chosen": 4.53125, + "logits/rejected": 4.359375, + "logps/chosen": -895.0, + "logps/rejected": -660.5, + "loss": 0.4775, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.2451171875, + "rewards/margins": 6.859375, + "rewards/rejected": -4.6015625, + "step": 2718 + }, + { + "epoch": 0.5395108884369264, + "grad_norm": 30.619448579731973, + "learning_rate": 5.668260374243467e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 4.171875, + "logps/chosen": -875.0, + "logps/rejected": -720.0, + "loss": 0.418, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.298828125, + "rewards/margins": 6.6640625, + "rewards/rejected": -4.36328125, + "step": 2719 + }, + { + "epoch": 0.5397093109777271, + "grad_norm": 28.425194272440258, + "learning_rate": 
5.665145854223122e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.625, + "logps/chosen": -924.0, + "logps/rejected": -1715.5, + "loss": 0.5529, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5858154296875, + "rewards/margins": 8.2734375, + "rewards/rejected": -6.6953125, + "step": 2720 + }, + { + "epoch": 0.5399077335185277, + "grad_norm": 31.886634978956838, + "learning_rate": 5.662031254985244e-07, + "logits/chosen": 3.921875, + "logits/rejected": 4.12890625, + "logps/chosen": -777.0, + "logps/rejected": -629.5, + "loss": 0.4123, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.35546875, + "rewards/margins": 6.2734375, + "rewards/rejected": -3.92578125, + "step": 2721 + }, + { + "epoch": 0.5401061560593283, + "grad_norm": 28.153726226374374, + "learning_rate": 5.658916578023852e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.05078125, + "logps/chosen": -900.5, + "logps/rejected": -649.0, + "loss": 0.317, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.65625, + "rewards/margins": 8.3359375, + "rewards/rejected": -5.6875, + "step": 2722 + }, + { + "epoch": 0.5403045786001289, + "grad_norm": 27.68208788957716, + "learning_rate": 5.655801824833e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 3.94921875, + "logps/chosen": -882.0, + "logps/rejected": -649.0, + "loss": 0.3902, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.2705078125, + "rewards/margins": 6.25, + "rewards/rejected": -3.97265625, + "step": 2723 + }, + { + "epoch": 0.5405030011409296, + "grad_norm": 33.929554163004966, + "learning_rate": 5.652686996906782e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 3.796875, + "logps/chosen": -1134.5, + "logps/rejected": -631.0, + "loss": 0.4314, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.05078125, + "rewards/margins": 7.1640625, + "rewards/rejected": -4.1171875, + "step": 2724 + }, + { + "epoch": 0.5407014236817302, + "grad_norm": 42.167990623501474, + "learning_rate": 5.64957209573932e-07, + 
"logits/chosen": 4.06640625, + "logits/rejected": 4.09375, + "logps/chosen": -1215.0, + "logps/rejected": -1039.0, + "loss": 0.3865, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.78515625, + "rewards/margins": 7.3515625, + "rewards/rejected": -4.5625, + "step": 2725 + }, + { + "epoch": 0.5408998462225308, + "grad_norm": 34.10399261992165, + "learning_rate": 5.646457122824783e-07, + "logits/chosen": 4.22265625, + "logits/rejected": 4.12890625, + "logps/chosen": -1090.0, + "logps/rejected": -822.0, + "loss": 0.3794, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.3125, + "rewards/margins": 7.3671875, + "rewards/rejected": -4.046875, + "step": 2726 + }, + { + "epoch": 0.5410982687633316, + "grad_norm": 31.467594536898623, + "learning_rate": 5.643342079657366e-07, + "logits/chosen": 3.75, + "logits/rejected": 3.71484375, + "logps/chosen": -1069.0, + "logps/rejected": -695.5, + "loss": 0.3759, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.357421875, + "rewards/margins": 7.1484375, + "rewards/rejected": -4.7890625, + "step": 2727 + }, + { + "epoch": 0.5412966913041322, + "grad_norm": 36.513705466902486, + "learning_rate": 5.640226967731299e-07, + "logits/chosen": 3.984375, + "logits/rejected": 4.14453125, + "logps/chosen": -1385.0, + "logps/rejected": -1258.0, + "loss": 0.3837, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.32421875, + "rewards/margins": 9.3828125, + "rewards/rejected": -6.0546875, + "step": 2728 + }, + { + "epoch": 0.5414951138449328, + "grad_norm": 26.47037761817446, + "learning_rate": 5.637111788540847e-07, + "logits/chosen": 4.28515625, + "logits/rejected": 4.05859375, + "logps/chosen": -736.0, + "logps/rejected": -592.5, + "loss": 0.3016, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7265625, + "rewards/margins": 7.4453125, + "rewards/rejected": -4.70703125, + "step": 2729 + }, + { + "epoch": 0.5416935363857334, + "grad_norm": 26.99448815440553, + "learning_rate": 5.633996543580305e-07, + "logits/chosen": 4.234375, 
+ "logits/rejected": 4.27734375, + "logps/chosen": -1071.0, + "logps/rejected": -766.0, + "loss": 0.4367, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.498046875, + "rewards/margins": 7.3203125, + "rewards/rejected": -4.8203125, + "step": 2730 + }, + { + "epoch": 0.5418919589265341, + "grad_norm": 25.63225935381981, + "learning_rate": 5.630881234344002e-07, + "logits/chosen": 4.31640625, + "logits/rejected": 4.265625, + "logps/chosen": -849.0, + "logps/rejected": -530.5, + "loss": 0.4025, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.6630859375, + "rewards/margins": 5.484375, + "rewards/rejected": -3.828125, + "step": 2731 + }, + { + "epoch": 0.5420903814673347, + "grad_norm": 37.360566247882225, + "learning_rate": 5.627765862326295e-07, + "logits/chosen": 3.6484375, + "logits/rejected": 3.59765625, + "logps/chosen": -667.0, + "logps/rejected": -540.0, + "loss": 0.5853, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.1064453125, + "rewards/margins": 5.046875, + "rewards/rejected": -3.94921875, + "step": 2732 + }, + { + "epoch": 0.5422888040081353, + "grad_norm": 30.26629487313545, + "learning_rate": 5.624650429021574e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.17578125, + "logps/chosen": -1212.0, + "logps/rejected": -1548.0, + "loss": 0.4289, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6328125, + "rewards/margins": 9.546875, + "rewards/rejected": -6.93359375, + "step": 2733 + }, + { + "epoch": 0.542487226548936, + "grad_norm": 37.02420100383982, + "learning_rate": 5.621534935924256e-07, + "logits/chosen": 3.9375, + "logits/rejected": 3.96875, + "logps/chosen": -1250.0, + "logps/rejected": -885.0, + "loss": 0.4851, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.70703125, + "rewards/margins": 7.2265625, + "rewards/rejected": -4.51171875, + "step": 2734 + }, + { + "epoch": 0.5426856490897366, + "grad_norm": 28.411372600701405, + "learning_rate": 5.618419384528787e-07, + "logits/chosen": 4.3203125, + 
"logits/rejected": 4.54296875, + "logps/chosen": -808.5, + "logps/rejected": -1170.0, + "loss": 0.4893, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.75, + "rewards/margins": 9.1328125, + "rewards/rejected": -7.390625, + "step": 2735 + }, + { + "epoch": 0.5428840716305372, + "grad_norm": 34.307400334551126, + "learning_rate": 5.615303776329646e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 3.86328125, + "logps/chosen": -1178.0, + "logps/rejected": -1358.0, + "loss": 0.3972, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.86328125, + "rewards/margins": 10.5859375, + "rewards/rejected": -7.7109375, + "step": 2736 + }, + { + "epoch": 0.5430824941713379, + "grad_norm": 32.71617874386774, + "learning_rate": 5.612188112821328e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.94921875, + "logps/chosen": -612.0, + "logps/rejected": -772.0, + "loss": 0.537, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.00732421875, + "rewards/margins": 6.046875, + "rewards/rejected": -4.0546875, + "step": 2737 + }, + { + "epoch": 0.5432809167121385, + "grad_norm": 35.05729691119477, + "learning_rate": 5.609072395498366e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 3.94140625, + "logps/chosen": -1018.0, + "logps/rejected": -735.0, + "loss": 0.3101, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.248046875, + "rewards/margins": 9.1015625, + "rewards/rejected": -6.8359375, + "step": 2738 + }, + { + "epoch": 0.5434793392529391, + "grad_norm": 35.719152987460184, + "learning_rate": 5.605956625855314e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 4.046875, + "logps/chosen": -819.5, + "logps/rejected": -711.5, + "loss": 0.5448, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.29443359375, + "rewards/margins": 6.68359375, + "rewards/rejected": -5.3828125, + "step": 2739 + }, + { + "epoch": 0.5436777617937397, + "grad_norm": 36.3814291760784, + "learning_rate": 5.602840805386752e-07, + "logits/chosen": 3.8984375, + 
"logits/rejected": 3.69140625, + "logps/chosen": -999.0, + "logps/rejected": -647.5, + "loss": 0.3393, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.94921875, + "rewards/margins": 7.515625, + "rewards/rejected": -5.5625, + "step": 2740 + }, + { + "epoch": 0.5438761843345404, + "grad_norm": 29.40972588265389, + "learning_rate": 5.599724935587281e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.90234375, + "logps/chosen": -1074.0, + "logps/rejected": -601.0, + "loss": 0.3017, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.3359375, + "rewards/margins": 8.9453125, + "rewards/rejected": -6.6171875, + "step": 2741 + }, + { + "epoch": 0.544074606875341, + "grad_norm": 29.37972397586857, + "learning_rate": 5.59660901795153e-07, + "logits/chosen": 3.796875, + "logits/rejected": 3.984375, + "logps/chosen": -917.0, + "logps/rejected": -694.5, + "loss": 0.4053, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.216796875, + "rewards/margins": 9.65625, + "rewards/rejected": -6.4453125, + "step": 2742 + }, + { + "epoch": 0.5442730294161416, + "grad_norm": 29.075257831665194, + "learning_rate": 5.59349305397415e-07, + "logits/chosen": 4.71875, + "logits/rejected": 4.5, + "logps/chosen": -1085.0, + "logps/rejected": -789.0, + "loss": 0.4419, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.75, + "rewards/margins": 7.9453125, + "rewards/rejected": -5.18359375, + "step": 2743 + }, + { + "epoch": 0.5444714519569424, + "grad_norm": 24.97401568190278, + "learning_rate": 5.590377045149813e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.2109375, + "logps/chosen": -1165.0, + "logps/rejected": -1011.0, + "loss": 0.4089, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.642578125, + "rewards/margins": 9.25, + "rewards/rejected": -6.60546875, + "step": 2744 + }, + { + "epoch": 0.544669874497743, + "grad_norm": 30.443293570643103, + "learning_rate": 5.587260992973209e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 3.82421875, + 
"logps/chosen": -1000.0, + "logps/rejected": -596.0, + "loss": 0.4177, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8349609375, + "rewards/margins": 7.5546875, + "rewards/rejected": -4.71484375, + "step": 2745 + }, + { + "epoch": 0.5448682970385436, + "grad_norm": 31.663091728550864, + "learning_rate": 5.58414489893906e-07, + "logits/chosen": 4.421875, + "logits/rejected": 3.9609375, + "logps/chosen": -824.0, + "logps/rejected": -652.0, + "loss": 0.4455, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.65234375, + "rewards/margins": 7.46875, + "rewards/rejected": -4.8125, + "step": 2746 + }, + { + "epoch": 0.5450667195793442, + "grad_norm": 34.57560124321522, + "learning_rate": 5.581028764542097e-07, + "logits/chosen": 3.7734375, + "logits/rejected": 4.234375, + "logps/chosen": -867.0, + "logps/rejected": -1171.0, + "loss": 0.4119, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.677734375, + "rewards/margins": 8.5078125, + "rewards/rejected": -6.8203125, + "step": 2747 + }, + { + "epoch": 0.5452651421201449, + "grad_norm": 32.16652810913204, + "learning_rate": 5.577912591277075e-07, + "logits/chosen": 4.21875, + "logits/rejected": 3.9921875, + "logps/chosen": -918.0, + "logps/rejected": -543.0, + "loss": 0.3916, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.9189453125, + "rewards/margins": 7.71875, + "rewards/rejected": -5.8046875, + "step": 2748 + }, + { + "epoch": 0.5454635646609455, + "grad_norm": 25.18743972138868, + "learning_rate": 5.574796380638769e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 3.87890625, + "logps/chosen": -868.0, + "logps/rejected": -559.0, + "loss": 0.5208, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.287109375, + "rewards/margins": 6.38671875, + "rewards/rejected": -5.1015625, + "step": 2749 + }, + { + "epoch": 0.5456619872017461, + "grad_norm": 31.18117633978176, + "learning_rate": 5.571680134121965e-07, + "logits/chosen": 3.87109375, + "logits/rejected": 3.96484375, + "logps/chosen": 
-1122.0, + "logps/rejected": -736.5, + "loss": 0.3775, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.921875, + "rewards/margins": 7.9375, + "rewards/rejected": -5.02734375, + "step": 2750 + }, + { + "epoch": 0.5458604097425468, + "grad_norm": 28.70791089665322, + "learning_rate": 5.568563853221474e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 4.3125, + "logps/chosen": -933.0, + "logps/rejected": -785.0, + "loss": 0.3765, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1328125, + "rewards/margins": 9.328125, + "rewards/rejected": -7.203125, + "step": 2751 + }, + { + "epoch": 0.5460588322833474, + "grad_norm": 33.423803738619995, + "learning_rate": 5.565447539432122e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 4.2265625, + "logps/chosen": -717.0, + "logps/rejected": -748.0, + "loss": 0.4457, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.64013671875, + "rewards/margins": 6.765625, + "rewards/rejected": -5.12890625, + "step": 2752 + }, + { + "epoch": 0.546257254824148, + "grad_norm": 29.784723107256074, + "learning_rate": 5.562331194248748e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.12109375, + "logps/chosen": -1194.0, + "logps/rejected": -1010.0, + "loss": 0.3283, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2734375, + "rewards/margins": 8.96875, + "rewards/rejected": -5.69921875, + "step": 2753 + }, + { + "epoch": 0.5464556773649487, + "grad_norm": 28.31163136536168, + "learning_rate": 5.559214819166206e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.85546875, + "logps/chosen": -1196.0, + "logps/rejected": -928.0, + "loss": 0.3606, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.3203125, + "rewards/margins": 8.1328125, + "rewards/rejected": -5.8125, + "step": 2754 + }, + { + "epoch": 0.5466540999057493, + "grad_norm": 27.608469435606, + "learning_rate": 5.556098415679368e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 3.9921875, + "logps/chosen": -957.0, + "logps/rejected": 
-931.5, + "loss": 0.4519, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.05859375, + "rewards/margins": 9.09375, + "rewards/rejected": -7.01953125, + "step": 2755 + }, + { + "epoch": 0.5468525224465499, + "grad_norm": 37.46731299533176, + "learning_rate": 5.552981985283115e-07, + "logits/chosen": 4.5078125, + "logits/rejected": 4.109375, + "logps/chosen": -833.5, + "logps/rejected": -1096.5, + "loss": 0.4632, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.322265625, + "rewards/margins": 7.140625, + "rewards/rejected": -5.82421875, + "step": 2756 + }, + { + "epoch": 0.5470509449873505, + "grad_norm": 26.43803474218473, + "learning_rate": 5.549865529472347e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.13671875, + "logps/chosen": -1317.0, + "logps/rejected": -685.0, + "loss": 0.4266, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.181640625, + "rewards/margins": 6.5859375, + "rewards/rejected": -4.40234375, + "step": 2757 + }, + { + "epoch": 0.5472493675281512, + "grad_norm": 26.178642122224023, + "learning_rate": 5.546749049741969e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.09375, + "logps/chosen": -1201.0, + "logps/rejected": -770.0, + "loss": 0.272, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.05859375, + "rewards/margins": 8.8984375, + "rewards/rejected": -5.8359375, + "step": 2758 + }, + { + "epoch": 0.5474477900689518, + "grad_norm": 27.554714577491172, + "learning_rate": 5.543632547586904e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.421875, + "logps/chosen": -1120.0, + "logps/rejected": -1124.0, + "loss": 0.3411, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.265625, + "rewards/margins": 11.109375, + "rewards/rejected": -8.84375, + "step": 2759 + }, + { + "epoch": 0.5476462126097524, + "grad_norm": 29.813202820879283, + "learning_rate": 5.540516024502079e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.03125, + "logps/chosen": -1054.0, + "logps/rejected": -1131.0, + "loss": 0.4374, 
+ "rewards/accuracies": 0.8125, + "rewards/chosen": 1.9375, + "rewards/margins": 9.3828125, + "rewards/rejected": -7.4453125, + "step": 2760 + }, + { + "epoch": 0.5478446351505531, + "grad_norm": 29.610743314516863, + "learning_rate": 5.537399481982438e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.5625, + "logps/chosen": -988.0, + "logps/rejected": -767.0, + "loss": 0.4409, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.615234375, + "rewards/margins": 8.16015625, + "rewards/rejected": -5.55078125, + "step": 2761 + }, + { + "epoch": 0.5480430576913538, + "grad_norm": 31.33242467089593, + "learning_rate": 5.534282921522927e-07, + "logits/chosen": 4.546875, + "logits/rejected": 4.34765625, + "logps/chosen": -787.0, + "logps/rejected": -602.0, + "loss": 0.4438, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.04296875, + "rewards/margins": 5.48828125, + "rewards/rejected": -3.44140625, + "step": 2762 + }, + { + "epoch": 0.5482414802321544, + "grad_norm": 33.32793807756795, + "learning_rate": 5.531166344618511e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.09765625, + "logps/chosen": -1343.0, + "logps/rejected": -729.5, + "loss": 0.3527, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.953125, + "rewards/margins": 8.5, + "rewards/rejected": -5.55859375, + "step": 2763 + }, + { + "epoch": 0.548439902772955, + "grad_norm": 29.713893701917577, + "learning_rate": 5.528049752764151e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.93359375, + "logps/chosen": -934.0, + "logps/rejected": -567.0, + "loss": 0.3133, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.685546875, + "rewards/margins": 7.859375, + "rewards/rejected": -5.171875, + "step": 2764 + }, + { + "epoch": 0.5486383253137557, + "grad_norm": 35.76932937687861, + "learning_rate": 5.524933147454825e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.99609375, + "logps/chosen": -1062.5, + "logps/rejected": -794.0, + "loss": 0.6006, + "rewards/accuracies": 
0.71875, + "rewards/chosen": 1.031005859375, + "rewards/margins": 4.44140625, + "rewards/rejected": -3.40234375, + "step": 2765 + }, + { + "epoch": 0.5488367478545563, + "grad_norm": 29.457921195048403, + "learning_rate": 5.521816530185508e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.82421875, + "logps/chosen": -998.0, + "logps/rejected": -732.0, + "loss": 0.3701, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.853515625, + "rewards/margins": 7.4921875, + "rewards/rejected": -5.640625, + "step": 2766 + }, + { + "epoch": 0.5490351703953569, + "grad_norm": 46.96606611968636, + "learning_rate": 5.518699902451192e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.0859375, + "logps/chosen": -1145.0, + "logps/rejected": -893.0, + "loss": 0.3336, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.326171875, + "rewards/margins": 7.6171875, + "rewards/rejected": -5.296875, + "step": 2767 + }, + { + "epoch": 0.5492335929361576, + "grad_norm": 30.99831127604519, + "learning_rate": 5.515583265746863e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.23828125, + "logps/chosen": -798.0, + "logps/rejected": -672.0, + "loss": 0.4945, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.162109375, + "rewards/margins": 5.96875, + "rewards/rejected": -3.798828125, + "step": 2768 + }, + { + "epoch": 0.5494320154769582, + "grad_norm": 32.509426169888584, + "learning_rate": 5.51246662156752e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 4.203125, + "logps/chosen": -1359.0, + "logps/rejected": -1089.0, + "loss": 0.3245, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.80859375, + "rewards/margins": 9.125, + "rewards/rejected": -5.32421875, + "step": 2769 + }, + { + "epoch": 0.5496304380177588, + "grad_norm": 26.737088467433768, + "learning_rate": 5.509349971408157e-07, + "logits/chosen": 4.45703125, + "logits/rejected": 4.703125, + "logps/chosen": -1063.0, + "logps/rejected": -1365.5, + "loss": 0.5079, + "rewards/accuracies": 0.71875, 
+ "rewards/chosen": 2.603515625, + "rewards/margins": 7.828125, + "rewards/rejected": -5.220703125, + "step": 2770 + }, + { + "epoch": 0.5498288605585595, + "grad_norm": 32.718610457690716, + "learning_rate": 5.50623331676378e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.2578125, + "logps/chosen": -962.0, + "logps/rejected": -753.0, + "loss": 0.3525, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.400390625, + "rewards/margins": 6.9609375, + "rewards/rejected": -4.55078125, + "step": 2771 + }, + { + "epoch": 0.5500272830993601, + "grad_norm": 29.82173312750831, + "learning_rate": 5.503116659129393e-07, + "logits/chosen": 3.75390625, + "logits/rejected": 3.46875, + "logps/chosen": -776.0, + "logps/rejected": -544.5, + "loss": 0.3286, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.296875, + "rewards/margins": 6.8046875, + "rewards/rejected": -4.50390625, + "step": 2772 + }, + { + "epoch": 0.5502257056401607, + "grad_norm": 28.3991898123596, + "learning_rate": 5.5e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.71875, + "logps/chosen": -1364.0, + "logps/rejected": -829.0, + "loss": 0.3316, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.55078125, + "rewards/margins": 7.53125, + "rewards/rejected": -4.970703125, + "step": 2773 + }, + { + "epoch": 0.5504241281809613, + "grad_norm": 40.3738736653673, + "learning_rate": 5.496883340870606e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.8671875, + "logps/chosen": -836.0, + "logps/rejected": -1406.0, + "loss": 0.4555, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1171875, + "rewards/margins": 7.4140625, + "rewards/rejected": -5.2890625, + "step": 2774 + }, + { + "epoch": 0.550622550721762, + "grad_norm": 35.46985115112191, + "learning_rate": 5.493766683236219e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.29296875, + "logps/chosen": -1074.0, + "logps/rejected": -792.0, + "loss": 0.394, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4140625, + 
"rewards/margins": 6.3515625, + "rewards/rejected": -3.9296875, + "step": 2775 + }, + { + "epoch": 0.5508209732625626, + "grad_norm": 29.816666328018037, + "learning_rate": 5.490650028591842e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 3.91015625, + "logps/chosen": -1036.0, + "logps/rejected": -788.0, + "loss": 0.3544, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6865234375, + "rewards/margins": 9.65625, + "rewards/rejected": -6.96875, + "step": 2776 + }, + { + "epoch": 0.5510193958033632, + "grad_norm": 28.268235738270175, + "learning_rate": 5.487533378432482e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 3.8125, + "logps/chosen": -697.5, + "logps/rejected": -567.0, + "loss": 0.4788, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.123046875, + "rewards/margins": 5.2578125, + "rewards/rejected": -3.12890625, + "step": 2777 + }, + { + "epoch": 0.551217818344164, + "grad_norm": 31.136443018191795, + "learning_rate": 5.484416734253138e-07, + "logits/chosen": 3.62109375, + "logits/rejected": 3.6484375, + "logps/chosen": -846.0, + "logps/rejected": -738.5, + "loss": 0.2992, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.11328125, + "rewards/margins": 7.8828125, + "rewards/rejected": -5.7734375, + "step": 2778 + }, + { + "epoch": 0.5514162408849645, + "grad_norm": 30.8492172299215, + "learning_rate": 5.48130009754881e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.66015625, + "logps/chosen": -1075.0, + "logps/rejected": -902.0, + "loss": 0.3522, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6484375, + "rewards/margins": 9.546875, + "rewards/rejected": -6.875, + "step": 2779 + }, + { + "epoch": 0.5516146634257652, + "grad_norm": 22.63809223696341, + "learning_rate": 5.478183469814493e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.1015625, + "logps/chosen": -801.5, + "logps/rejected": -1498.0, + "loss": 0.348, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.04296875, + "rewards/margins": 
12.0546875, + "rewards/rejected": -9.01953125, + "step": 2780 + }, + { + "epoch": 0.5518130859665658, + "grad_norm": 42.16395580539006, + "learning_rate": 5.475066852545176e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.59375, + "logps/chosen": -1112.5, + "logps/rejected": -786.0, + "loss": 0.4546, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.61767578125, + "rewards/margins": 7.48046875, + "rewards/rejected": -5.8671875, + "step": 2781 + }, + { + "epoch": 0.5520115085073665, + "grad_norm": 39.1084348557229, + "learning_rate": 5.47195024723585e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.25, + "logps/chosen": -1130.0, + "logps/rejected": -750.5, + "loss": 0.4318, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5078125, + "rewards/margins": 7.609375, + "rewards/rejected": -5.109375, + "step": 2782 + }, + { + "epoch": 0.5522099310481671, + "grad_norm": 29.344546739199625, + "learning_rate": 5.46883365538149e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 3.6875, + "logps/chosen": -831.5, + "logps/rejected": -647.5, + "loss": 0.3687, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.2265625, + "rewards/margins": 7.609375, + "rewards/rejected": -5.390625, + "step": 2783 + }, + { + "epoch": 0.5524083535889677, + "grad_norm": 35.995823129737374, + "learning_rate": 5.465717078477073e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.86328125, + "logps/chosen": -1238.0, + "logps/rejected": -840.0, + "loss": 0.4118, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.794921875, + "rewards/margins": 8.4921875, + "rewards/rejected": -6.703125, + "step": 2784 + }, + { + "epoch": 0.5526067761297684, + "grad_norm": 27.812624524423075, + "learning_rate": 5.462600518017564e-07, + "logits/chosen": 4.6171875, + "logits/rejected": 4.578125, + "logps/chosen": -865.0, + "logps/rejected": -760.0, + "loss": 0.4231, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6015625, + "rewards/margins": 7.671875, + 
"rewards/rejected": -5.07421875, + "step": 2785 + }, + { + "epoch": 0.552805198670569, + "grad_norm": 26.325344265282165, + "learning_rate": 5.459483975497922e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.1640625, + "logps/chosen": -1069.0, + "logps/rejected": -1254.0, + "loss": 0.2959, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.2578125, + "rewards/margins": 10.4296875, + "rewards/rejected": -7.19140625, + "step": 2786 + }, + { + "epoch": 0.5530036212113696, + "grad_norm": 39.0241072560135, + "learning_rate": 5.456367452413096e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 4.0390625, + "logps/chosen": -963.0, + "logps/rejected": -780.0, + "loss": 0.3262, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.162109375, + "rewards/margins": 9.140625, + "rewards/rejected": -5.984375, + "step": 2787 + }, + { + "epoch": 0.5532020437521702, + "grad_norm": 31.77337438233275, + "learning_rate": 5.453250950258031e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 3.9375, + "logps/chosen": -948.5, + "logps/rejected": -691.0, + "loss": 0.3757, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.431640625, + "rewards/margins": 8.203125, + "rewards/rejected": -5.75, + "step": 2788 + }, + { + "epoch": 0.5534004662929709, + "grad_norm": 30.01830851267911, + "learning_rate": 5.450134470527653e-07, + "logits/chosen": 4.015625, + "logits/rejected": 3.8984375, + "logps/chosen": -875.0, + "logps/rejected": -1011.0, + "loss": 0.4009, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.294921875, + "rewards/margins": 8.984375, + "rewards/rejected": -6.69921875, + "step": 2789 + }, + { + "epoch": 0.5535988888337715, + "grad_norm": 33.65534209540577, + "learning_rate": 5.447018014716883e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.55859375, + "logps/chosen": -935.0, + "logps/rejected": -700.0, + "loss": 0.4091, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.12890625, + "rewards/margins": 7.1328125, + "rewards/rejected": 
-5.0078125, + "step": 2790 + }, + { + "epoch": 0.5537973113745721, + "grad_norm": 32.723712264975156, + "learning_rate": 5.443901584320633e-07, + "logits/chosen": 4.6875, + "logits/rejected": 4.40625, + "logps/chosen": -1162.0, + "logps/rejected": -1007.0, + "loss": 0.3975, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.04296875, + "rewards/margins": 9.16015625, + "rewards/rejected": -6.09765625, + "step": 2791 + }, + { + "epoch": 0.5539957339153728, + "grad_norm": 40.05915662572349, + "learning_rate": 5.440785180833794e-07, + "logits/chosen": 3.40625, + "logits/rejected": 3.546875, + "logps/chosen": -959.0, + "logps/rejected": -1039.0, + "loss": 0.4684, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.47265625, + "rewards/margins": 7.8984375, + "rewards/rejected": -5.421875, + "step": 2792 + }, + { + "epoch": 0.5541941564561734, + "grad_norm": 39.4849328137115, + "learning_rate": 5.437668805751253e-07, + "logits/chosen": 4.125, + "logits/rejected": 3.99609375, + "logps/chosen": -928.0, + "logps/rejected": -675.0, + "loss": 0.4869, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8154296875, + "rewards/margins": 6.0703125, + "rewards/rejected": -4.25390625, + "step": 2793 + }, + { + "epoch": 0.554392578996974, + "grad_norm": 41.4400625138826, + "learning_rate": 5.434552460567877e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 4.03515625, + "logps/chosen": -996.0, + "logps/rejected": -812.5, + "loss": 0.5051, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.314453125, + "rewards/margins": 6.203125, + "rewards/rejected": -3.888671875, + "step": 2794 + }, + { + "epoch": 0.5545910015377747, + "grad_norm": 31.662117203168066, + "learning_rate": 5.431436146778527e-07, + "logits/chosen": 3.6875, + "logits/rejected": 3.671875, + "logps/chosen": -879.0, + "logps/rejected": -1463.0, + "loss": 0.462, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.05859375, + "rewards/margins": 8.9609375, + "rewards/rejected": -6.9140625, + "step": 2795 + 
}, + { + "epoch": 0.5547894240785753, + "grad_norm": 36.44965156307726, + "learning_rate": 5.428319865878036e-07, + "logits/chosen": 4.4453125, + "logits/rejected": 4.44140625, + "logps/chosen": -1767.0, + "logps/rejected": -1289.5, + "loss": 0.4348, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.546875, + "rewards/margins": 7.1875, + "rewards/rejected": -6.654296875, + "step": 2796 + }, + { + "epoch": 0.554987846619376, + "grad_norm": 27.12716880723758, + "learning_rate": 5.425203619361233e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.28125, + "logps/chosen": -935.0, + "logps/rejected": -764.5, + "loss": 0.2807, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.83203125, + "rewards/margins": 9.2578125, + "rewards/rejected": -6.4375, + "step": 2797 + }, + { + "epoch": 0.5551862691601765, + "grad_norm": 31.895289930488993, + "learning_rate": 5.422087408722925e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.69921875, + "logps/chosen": -1299.0, + "logps/rejected": -926.0, + "loss": 0.2477, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.21484375, + "rewards/margins": 10.875, + "rewards/rejected": -7.671875, + "step": 2798 + }, + { + "epoch": 0.5553846917009773, + "grad_norm": 30.181856974933545, + "learning_rate": 5.418971235457902e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 3.81640625, + "logps/chosen": -1071.0, + "logps/rejected": -853.0, + "loss": 0.4685, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.892578125, + "rewards/margins": 7.2265625, + "rewards/rejected": -4.33984375, + "step": 2799 + }, + { + "epoch": 0.5555831142417779, + "grad_norm": 28.636484638883953, + "learning_rate": 5.415855101060941e-07, + "logits/chosen": 4.4453125, + "logits/rejected": 4.40625, + "logps/chosen": -838.0, + "logps/rejected": -638.0, + "loss": 0.4068, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.82421875, + "rewards/margins": 6.625, + "rewards/rejected": -3.81640625, + "step": 2800 + }, + { + "epoch": 
0.5557815367825785, + "grad_norm": 31.736461447405667, + "learning_rate": 5.41273900702679e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.02734375, + "logps/chosen": -1434.0, + "logps/rejected": -796.0, + "loss": 0.2989, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.89453125, + "rewards/margins": 7.4140625, + "rewards/rejected": -4.515625, + "step": 2801 + }, + { + "epoch": 0.5559799593233792, + "grad_norm": 30.20013722369092, + "learning_rate": 5.40962295485019e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.140625, + "logps/chosen": -1263.5, + "logps/rejected": -969.0, + "loss": 0.2878, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.318359375, + "rewards/margins": 9.8984375, + "rewards/rejected": -6.5859375, + "step": 2802 + }, + { + "epoch": 0.5561783818641798, + "grad_norm": 31.45873562400325, + "learning_rate": 5.406506946025851e-07, + "logits/chosen": 3.984375, + "logits/rejected": 4.07421875, + "logps/chosen": -923.0, + "logps/rejected": -650.0, + "loss": 0.4773, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.54296875, + "rewards/margins": 6.6796875, + "rewards/rejected": -4.1484375, + "step": 2803 + }, + { + "epoch": 0.5563768044049804, + "grad_norm": 36.66895533713544, + "learning_rate": 5.403390982048472e-07, + "logits/chosen": 3.75390625, + "logits/rejected": 3.65625, + "logps/chosen": -1065.0, + "logps/rejected": -822.0, + "loss": 0.3609, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.009765625, + "rewards/margins": 7.7890625, + "rewards/rejected": -4.77734375, + "step": 2804 + }, + { + "epoch": 0.556575226945781, + "grad_norm": 28.80221229272608, + "learning_rate": 5.400275064412719e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.46875, + "logps/chosen": -940.0, + "logps/rejected": -560.0, + "loss": 0.3006, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6923828125, + "rewards/margins": 8.09765625, + "rewards/rejected": -5.41015625, + "step": 2805 + }, + { + "epoch": 
0.5567736494865817, + "grad_norm": 38.44873667720716, + "learning_rate": 5.397159194613249e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.203125, + "logps/chosen": -880.0, + "logps/rejected": -584.0, + "loss": 0.241, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6015625, + "rewards/margins": 8.5859375, + "rewards/rejected": -5.9765625, + "step": 2806 + }, + { + "epoch": 0.5569720720273823, + "grad_norm": 32.92772569904432, + "learning_rate": 5.394043374144686e-07, + "logits/chosen": 4.53515625, + "logits/rejected": 4.2265625, + "logps/chosen": -941.0, + "logps/rejected": -659.5, + "loss": 0.4337, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.73046875, + "rewards/margins": 6.5, + "rewards/rejected": -3.76171875, + "step": 2807 + }, + { + "epoch": 0.5571704945681829, + "grad_norm": 29.93235167271004, + "learning_rate": 5.390927604501632e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.34765625, + "logps/chosen": -1078.0, + "logps/rejected": -796.0, + "loss": 0.3662, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.1650390625, + "rewards/margins": 7.640625, + "rewards/rejected": -5.46875, + "step": 2808 + }, + { + "epoch": 0.5573689171089836, + "grad_norm": 38.12666811639459, + "learning_rate": 5.387811887178673e-07, + "logits/chosen": 3.546875, + "logits/rejected": 3.54296875, + "logps/chosen": -1033.0, + "logps/rejected": -748.0, + "loss": 0.3467, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.98828125, + "rewards/margins": 8.03125, + "rewards/rejected": -5.0390625, + "step": 2809 + }, + { + "epoch": 0.5575673396497842, + "grad_norm": 35.2155994294035, + "learning_rate": 5.384696223670355e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.05078125, + "logps/chosen": -868.0, + "logps/rejected": -752.5, + "loss": 0.4474, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.83203125, + "rewards/margins": 6.640625, + "rewards/rejected": -4.8046875, + "step": 2810 + }, + { + "epoch": 0.5577657621905848, + 
"grad_norm": 38.5395462357415, + "learning_rate": 5.381580615471212e-07, + "logits/chosen": 3.87109375, + "logits/rejected": 3.93359375, + "logps/chosen": -817.0, + "logps/rejected": -658.5, + "loss": 0.3738, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.08544921875, + "rewards/margins": 8.078125, + "rewards/rejected": -5.99609375, + "step": 2811 + }, + { + "epoch": 0.5579641847313855, + "grad_norm": 31.182633727874936, + "learning_rate": 5.378465064075745e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.20703125, + "logps/chosen": -994.0, + "logps/rejected": -751.0, + "loss": 0.4238, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4296875, + "rewards/margins": 7.6953125, + "rewards/rejected": -5.265625, + "step": 2812 + }, + { + "epoch": 0.5581626072721861, + "grad_norm": 31.44380329121959, + "learning_rate": 5.375349570978426e-07, + "logits/chosen": 4.3984375, + "logits/rejected": 4.40625, + "logps/chosen": -989.0, + "logps/rejected": -720.0, + "loss": 0.4217, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.310546875, + "rewards/margins": 7.16796875, + "rewards/rejected": -4.8359375, + "step": 2813 + }, + { + "epoch": 0.5583610298129867, + "grad_norm": 28.962430230454462, + "learning_rate": 5.372234137673706e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 4.0703125, + "logps/chosen": -1304.0, + "logps/rejected": -955.5, + "loss": 0.4496, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3330078125, + "rewards/margins": 9.1875, + "rewards/rejected": -6.8515625, + "step": 2814 + }, + { + "epoch": 0.5585594523537873, + "grad_norm": 30.792607805755697, + "learning_rate": 5.369118765655998e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.30078125, + "logps/chosen": -1202.5, + "logps/rejected": -748.0, + "loss": 0.3087, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.837890625, + "rewards/margins": 9.0859375, + "rewards/rejected": -6.2578125, + "step": 2815 + }, + { + "epoch": 0.5587578748945881, + "grad_norm": 
28.292061008740994, + "learning_rate": 5.366003456419695e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 4.046875, + "logps/chosen": -907.0, + "logps/rejected": -1771.5, + "loss": 0.3518, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.171875, + "rewards/margins": 12.1171875, + "rewards/rejected": -9.9296875, + "step": 2816 + }, + { + "epoch": 0.5589562974353887, + "grad_norm": 38.44825873001276, + "learning_rate": 5.362888211459152e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.9765625, + "logps/chosen": -733.5, + "logps/rejected": -979.0, + "loss": 0.5113, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.154296875, + "rewards/margins": 7.84375, + "rewards/rejected": -6.6953125, + "step": 2817 + }, + { + "epoch": 0.5591547199761893, + "grad_norm": 30.775309011322932, + "learning_rate": 5.359773032268702e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.0546875, + "logps/chosen": -1084.0, + "logps/rejected": -642.5, + "loss": 0.4424, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.005859375, + "rewards/margins": 7.1328125, + "rewards/rejected": -5.1171875, + "step": 2818 + }, + { + "epoch": 0.55935314251699, + "grad_norm": 31.634318661101616, + "learning_rate": 5.356657920342634e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.01953125, + "logps/chosen": -839.0, + "logps/rejected": -874.5, + "loss": 0.5132, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6064453125, + "rewards/margins": 5.90625, + "rewards/rejected": -4.30078125, + "step": 2819 + }, + { + "epoch": 0.5595515650577906, + "grad_norm": 40.46361843433206, + "learning_rate": 5.353542877175218e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.26953125, + "logps/chosen": -787.5, + "logps/rejected": -615.0, + "loss": 0.4315, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.169921875, + "rewards/margins": 7.4765625, + "rewards/rejected": -5.3046875, + "step": 2820 + }, + { + "epoch": 0.5597499875985912, + "grad_norm": 
28.98104534243202, + "learning_rate": 5.35042790426068e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 3.984375, + "logps/chosen": -631.0, + "logps/rejected": -385.5, + "loss": 0.4948, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.3173828125, + "rewards/margins": 5.90625, + "rewards/rejected": -4.5859375, + "step": 2821 + }, + { + "epoch": 0.5599484101393918, + "grad_norm": 31.786828041750436, + "learning_rate": 5.34731300309322e-07, + "logits/chosen": 4.05078125, + "logits/rejected": 3.95703125, + "logps/chosen": -1024.0, + "logps/rejected": -635.0, + "loss": 0.3528, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.474609375, + "rewards/margins": 8.6796875, + "rewards/rejected": -6.2109375, + "step": 2822 + }, + { + "epoch": 0.5601468326801925, + "grad_norm": 26.32846204380593, + "learning_rate": 5.344198175167001e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.10546875, + "logps/chosen": -1310.0, + "logps/rejected": -764.0, + "loss": 0.5872, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.861328125, + "rewards/margins": 7.47265625, + "rewards/rejected": -5.60546875, + "step": 2823 + }, + { + "epoch": 0.5603452552209931, + "grad_norm": 32.005019612089605, + "learning_rate": 5.341083421976148e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.25390625, + "logps/chosen": -1050.0, + "logps/rejected": -755.5, + "loss": 0.3606, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.98046875, + "rewards/margins": 8.4296875, + "rewards/rejected": -5.4296875, + "step": 2824 + }, + { + "epoch": 0.5605436777617937, + "grad_norm": 35.85920085890435, + "learning_rate": 5.337968745014757e-07, + "logits/chosen": 3.62109375, + "logits/rejected": 3.74609375, + "logps/chosen": -869.0, + "logps/rejected": -832.0, + "loss": 0.3259, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.26171875, + "rewards/margins": 10.7578125, + "rewards/rejected": -7.51171875, + "step": 2825 + }, + { + "epoch": 0.5607421003025944, + "grad_norm": 
36.4770031488888, + "learning_rate": 5.334854145776879e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.015625, + "logps/chosen": -974.0, + "logps/rejected": -750.5, + "loss": 0.5136, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.021484375, + "rewards/margins": 7.03125, + "rewards/rejected": -5.00390625, + "step": 2826 + }, + { + "epoch": 0.560940522843395, + "grad_norm": 35.40681682213776, + "learning_rate": 5.331739625756535e-07, + "logits/chosen": 3.6953125, + "logits/rejected": 3.86328125, + "logps/chosen": -931.0, + "logps/rejected": -1675.0, + "loss": 0.3953, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5078125, + "rewards/margins": 11.2734375, + "rewards/rejected": -8.78125, + "step": 2827 + }, + { + "epoch": 0.5611389453841956, + "grad_norm": 37.028672109598475, + "learning_rate": 5.328625186447703e-07, + "logits/chosen": 4.35546875, + "logits/rejected": 4.3515625, + "logps/chosen": -830.5, + "logps/rejected": -678.0, + "loss": 0.6113, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.8134765625, + "rewards/margins": 4.421875, + "rewards/rejected": -3.6025390625, + "step": 2828 + }, + { + "epoch": 0.5613373679249963, + "grad_norm": 40.20839131951292, + "learning_rate": 5.325510829344324e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.2421875, + "logps/chosen": -842.0, + "logps/rejected": -718.5, + "loss": 0.5841, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.3603515625, + "rewards/margins": 6.5703125, + "rewards/rejected": -5.2109375, + "step": 2829 + }, + { + "epoch": 0.5615357904657969, + "grad_norm": 27.846671060977364, + "learning_rate": 5.322396555940303e-07, + "logits/chosen": 4.36328125, + "logits/rejected": 3.96875, + "logps/chosen": -827.0, + "logps/rejected": -506.0, + "loss": 0.3836, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.5625, + "rewards/margins": 7.71875, + "rewards/rejected": -4.14453125, + "step": 2830 + }, + { + "epoch": 0.5617342130065975, + "grad_norm": 29.488598861705242, 
+ "learning_rate": 5.319282367729497e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.203125, + "logps/chosen": -1054.0, + "logps/rejected": -1125.0, + "loss": 0.435, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.65673828125, + "rewards/margins": 9.0859375, + "rewards/rejected": -7.4140625, + "step": 2831 + }, + { + "epoch": 0.5619326355473981, + "grad_norm": 37.931197359037775, + "learning_rate": 5.31616826620573e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 3.97265625, + "logps/chosen": -1206.0, + "logps/rejected": -979.0, + "loss": 0.3281, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.22265625, + "rewards/margins": 8.421875, + "rewards/rejected": -5.19140625, + "step": 2832 + }, + { + "epoch": 0.5621310580881989, + "grad_norm": 32.30243414218203, + "learning_rate": 5.31305425286278e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 4.1875, + "logps/chosen": -770.0, + "logps/rejected": -771.0, + "loss": 0.4805, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.30859375, + "rewards/margins": 6.2890625, + "rewards/rejected": -3.98828125, + "step": 2833 + }, + { + "epoch": 0.5623294806289995, + "grad_norm": 34.64284484756739, + "learning_rate": 5.309940329194385e-07, + "logits/chosen": 3.61328125, + "logits/rejected": 3.7109375, + "logps/chosen": -1089.5, + "logps/rejected": -735.0, + "loss": 0.3364, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.880859375, + "rewards/margins": 8.3984375, + "rewards/rejected": -6.5390625, + "step": 2834 + }, + { + "epoch": 0.5625279031698001, + "grad_norm": 36.0554081256751, + "learning_rate": 5.306826496694238e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 4.09765625, + "logps/chosen": -883.0, + "logps/rejected": -617.0, + "loss": 0.4523, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.8115234375, + "rewards/margins": 6.546875, + "rewards/rejected": -4.71875, + "step": 2835 + }, + { + "epoch": 0.5627263257106008, + "grad_norm": 31.9887592091149, + "learning_rate": 
5.303712756855988e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.71484375, + "logps/chosen": -1197.0, + "logps/rejected": -657.0, + "loss": 0.3385, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2021484375, + "rewards/margins": 7.765625, + "rewards/rejected": -5.578125, + "step": 2836 + }, + { + "epoch": 0.5629247482514014, + "grad_norm": 32.80828863876562, + "learning_rate": 5.300599111173243e-07, + "logits/chosen": 3.57421875, + "logits/rejected": 3.609375, + "logps/chosen": -897.0, + "logps/rejected": -1605.0, + "loss": 0.3982, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5595703125, + "rewards/margins": 11.625, + "rewards/rejected": -9.0390625, + "step": 2837 + }, + { + "epoch": 0.563123170792202, + "grad_norm": 28.452103494204543, + "learning_rate": 5.297485561139559e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.1640625, + "logps/chosen": -1111.0, + "logps/rejected": -790.5, + "loss": 0.3776, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.44140625, + "rewards/margins": 8.546875, + "rewards/rejected": -6.1171875, + "step": 2838 + }, + { + "epoch": 0.5633215933330026, + "grad_norm": 27.816302481550306, + "learning_rate": 5.294372108248459e-07, + "logits/chosen": 3.9375, + "logits/rejected": 4.1640625, + "logps/chosen": -1053.0, + "logps/rejected": -1790.0, + "loss": 0.3091, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.25, + "rewards/margins": 12.9453125, + "rewards/rejected": -9.67578125, + "step": 2839 + }, + { + "epoch": 0.5635200158738033, + "grad_norm": 42.27474796727771, + "learning_rate": 5.291258753993403e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.12109375, + "logps/chosen": -1361.0, + "logps/rejected": -892.0, + "loss": 0.3745, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1171875, + "rewards/margins": 7.03125, + "rewards/rejected": -3.91015625, + "step": 2840 + }, + { + "epoch": 0.5637184384146039, + "grad_norm": 34.07533708857294, + "learning_rate": 5.288145499867817e-07, + 
"logits/chosen": 3.88671875, + "logits/rejected": 3.671875, + "logps/chosen": -975.0, + "logps/rejected": -636.0, + "loss": 0.3315, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.15234375, + "rewards/margins": 6.953125, + "rewards/rejected": -3.79296875, + "step": 2841 + }, + { + "epoch": 0.5639168609554045, + "grad_norm": 42.38116874630739, + "learning_rate": 5.285032347365069e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.21875, + "logps/chosen": -991.0, + "logps/rejected": -1582.0, + "loss": 0.6051, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8671875, + "rewards/margins": 8.19921875, + "rewards/rejected": -6.32666015625, + "step": 2842 + }, + { + "epoch": 0.5641152834962052, + "grad_norm": 50.45436086785118, + "learning_rate": 5.281919297978487e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.171875, + "logps/chosen": -1160.0, + "logps/rejected": -958.0, + "loss": 0.3389, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.39453125, + "rewards/margins": 9.07421875, + "rewards/rejected": -5.681640625, + "step": 2843 + }, + { + "epoch": 0.5643137060370058, + "grad_norm": 29.90282663431855, + "learning_rate": 5.278806353201343e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 3.40625, + "logps/chosen": -1264.0, + "logps/rejected": -692.0, + "loss": 0.2791, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8046875, + "rewards/margins": 7.734375, + "rewards/rejected": -4.9375, + "step": 2844 + }, + { + "epoch": 0.5645121285778064, + "grad_norm": 33.16255565122866, + "learning_rate": 5.275693514526862e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.13671875, + "logps/chosen": -842.0, + "logps/rejected": -712.5, + "loss": 0.3484, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.61328125, + "rewards/margins": 6.96875, + "rewards/rejected": -4.3515625, + "step": 2845 + }, + { + "epoch": 0.5647105511186071, + "grad_norm": 23.961004742445756, + "learning_rate": 5.272580783448219e-07, + "logits/chosen": 
4.25390625, + "logits/rejected": 4.11328125, + "logps/chosen": -1081.0, + "logps/rejected": -720.0, + "loss": 0.3947, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.662109375, + "rewards/margins": 6.921875, + "rewards/rejected": -4.25390625, + "step": 2846 + }, + { + "epoch": 0.5649089736594077, + "grad_norm": 24.33510033202512, + "learning_rate": 5.269468161458533e-07, + "logits/chosen": 3.7109375, + "logits/rejected": 3.89453125, + "logps/chosen": -994.5, + "logps/rejected": -731.5, + "loss": 0.4361, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.099609375, + "rewards/margins": 6.65625, + "rewards/rejected": -3.56640625, + "step": 2847 + }, + { + "epoch": 0.5651073962002083, + "grad_norm": 57.261061520484304, + "learning_rate": 5.266355650050877e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.47265625, + "logps/chosen": -980.0, + "logps/rejected": -726.0, + "loss": 0.4524, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.7158203125, + "rewards/margins": 7.125, + "rewards/rejected": -5.41015625, + "step": 2848 + }, + { + "epoch": 0.5653058187410089, + "grad_norm": 28.823889136885292, + "learning_rate": 5.263243250718264e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 3.61328125, + "logps/chosen": -1074.0, + "logps/rejected": -920.0, + "loss": 0.1905, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.28125, + "rewards/margins": 9.6796875, + "rewards/rejected": -6.3984375, + "step": 2849 + }, + { + "epoch": 0.5655042412818096, + "grad_norm": 32.88849678298894, + "learning_rate": 5.260130964953658e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.015625, + "logps/chosen": -1086.0, + "logps/rejected": -821.5, + "loss": 0.4915, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.375, + "rewards/margins": 6.4921875, + "rewards/rejected": -4.109375, + "step": 2850 + }, + { + "epoch": 0.5657026638226103, + "grad_norm": 33.68219634509032, + "learning_rate": 5.257018794249968e-07, + "logits/chosen": 4.0234375, + 
"logits/rejected": 3.8984375, + "logps/chosen": -1110.0, + "logps/rejected": -708.5, + "loss": 0.3201, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.982421875, + "rewards/margins": 8.5078125, + "rewards/rejected": -5.52734375, + "step": 2851 + }, + { + "epoch": 0.5659010863634109, + "grad_norm": 40.766016335005666, + "learning_rate": 5.253906740100045e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.46875, + "logps/chosen": -727.0, + "logps/rejected": -1285.0, + "loss": 0.4695, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5986328125, + "rewards/margins": 7.88671875, + "rewards/rejected": -6.283203125, + "step": 2852 + }, + { + "epoch": 0.5660995089042116, + "grad_norm": 35.7518945643131, + "learning_rate": 5.250794803996687e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 3.703125, + "logps/chosen": -768.5, + "logps/rejected": -637.5, + "loss": 0.5736, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.318359375, + "rewards/margins": 5.03125, + "rewards/rejected": -3.705078125, + "step": 2853 + }, + { + "epoch": 0.5662979314450122, + "grad_norm": 29.572905940733012, + "learning_rate": 5.247682987432637e-07, + "logits/chosen": 3.78515625, + "logits/rejected": 4.01953125, + "logps/chosen": -1047.0, + "logps/rejected": -1684.0, + "loss": 0.414, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3427734375, + "rewards/margins": 9.734375, + "rewards/rejected": -7.3828125, + "step": 2854 + }, + { + "epoch": 0.5664963539858128, + "grad_norm": 30.135145143222196, + "learning_rate": 5.244571291900575e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.84765625, + "logps/chosen": -916.5, + "logps/rejected": -709.0, + "loss": 0.6284, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.57177734375, + "rewards/margins": 5.48828125, + "rewards/rejected": -3.92578125, + "step": 2855 + }, + { + "epoch": 0.5666947765266134, + "grad_norm": 32.504716298593294, + "learning_rate": 5.241459718893127e-07, + "logits/chosen": 3.9765625, + 
"logits/rejected": 4.01953125, + "logps/chosen": -1090.0, + "logps/rejected": -1571.0, + "loss": 0.4227, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.78515625, + "rewards/margins": 9.859375, + "rewards/rejected": -8.0625, + "step": 2856 + }, + { + "epoch": 0.5668931990674141, + "grad_norm": 28.23170003962611, + "learning_rate": 5.238348269902859e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.24609375, + "logps/chosen": -1052.0, + "logps/rejected": -952.0, + "loss": 0.3751, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.08984375, + "rewards/margins": 9.8671875, + "rewards/rejected": -6.77734375, + "step": 2857 + }, + { + "epoch": 0.5670916216082147, + "grad_norm": 33.325812009828965, + "learning_rate": 5.235236946422278e-07, + "logits/chosen": 3.6640625, + "logits/rejected": 3.7734375, + "logps/chosen": -1042.0, + "logps/rejected": -1023.0, + "loss": 0.4658, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.015625, + "rewards/margins": 9.0859375, + "rewards/rejected": -6.0625, + "step": 2858 + }, + { + "epoch": 0.5672900441490153, + "grad_norm": 22.786548752050543, + "learning_rate": 5.232125749943832e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 3.734375, + "logps/chosen": -1209.0, + "logps/rejected": -803.0, + "loss": 0.3988, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4296875, + "rewards/margins": 9.3828125, + "rewards/rejected": -6.93359375, + "step": 2859 + }, + { + "epoch": 0.567488466689816, + "grad_norm": 28.07583933798375, + "learning_rate": 5.229014681959903e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 3.88671875, + "logps/chosen": -1034.5, + "logps/rejected": -737.5, + "loss": 0.3629, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.3232421875, + "rewards/margins": 10.203125, + "rewards/rejected": -7.875, + "step": 2860 + }, + { + "epoch": 0.5676868892306166, + "grad_norm": 35.88169934710252, + "learning_rate": 5.225903743962819e-07, + "logits/chosen": 4.5390625, + "logits/rejected": 
4.1875, + "logps/chosen": -848.0, + "logps/rejected": -554.5, + "loss": 0.3499, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.751953125, + "rewards/margins": 7.3671875, + "rewards/rejected": -4.609375, + "step": 2861 + }, + { + "epoch": 0.5678853117714172, + "grad_norm": 32.06151406997534, + "learning_rate": 5.222792937444841e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.67578125, + "logps/chosen": -1032.0, + "logps/rejected": -622.5, + "loss": 0.2693, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.810546875, + "rewards/margins": 8.3671875, + "rewards/rejected": -5.55078125, + "step": 2862 + }, + { + "epoch": 0.5680837343122178, + "grad_norm": 35.95808508850466, + "learning_rate": 5.219682263898165e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.3046875, + "logps/chosen": -1151.0, + "logps/rejected": -1540.0, + "loss": 0.4367, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.115234375, + "rewards/margins": 9.3828125, + "rewards/rejected": -7.25, + "step": 2863 + }, + { + "epoch": 0.5682821568530185, + "grad_norm": 36.735947777658325, + "learning_rate": 5.21657172481493e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.34765625, + "logps/chosen": -774.0, + "logps/rejected": -995.0, + "loss": 0.4932, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.017578125, + "rewards/margins": 7.6328125, + "rewards/rejected": -5.6171875, + "step": 2864 + }, + { + "epoch": 0.5684805793938191, + "grad_norm": 24.265662699494502, + "learning_rate": 5.213461321687201e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.91796875, + "logps/chosen": -980.0, + "logps/rejected": -1384.0, + "loss": 0.2955, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.69140625, + "rewards/margins": 12.03125, + "rewards/rejected": -9.33984375, + "step": 2865 + }, + { + "epoch": 0.5686790019346197, + "grad_norm": 33.586325726997, + "learning_rate": 5.21035105600699e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.23046875, + 
"logps/chosen": -797.5, + "logps/rejected": -632.0, + "loss": 0.3937, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.91796875, + "rewards/margins": 5.92578125, + "rewards/rejected": -5.015625, + "step": 2866 + }, + { + "epoch": 0.5688774244754204, + "grad_norm": 28.55611119545489, + "learning_rate": 5.207240929266229e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.91796875, + "logps/chosen": -855.0, + "logps/rejected": -723.0, + "loss": 0.4122, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.640625, + "rewards/margins": 10.08984375, + "rewards/rejected": -7.44140625, + "step": 2867 + }, + { + "epoch": 0.569075847016221, + "grad_norm": 30.684559683950354, + "learning_rate": 5.204130942956795e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.28515625, + "logps/chosen": -971.0, + "logps/rejected": -680.0, + "loss": 0.2865, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.76953125, + "rewards/margins": 8.6484375, + "rewards/rejected": -5.875, + "step": 2868 + }, + { + "epoch": 0.5692742695570217, + "grad_norm": 40.65798463401735, + "learning_rate": 5.201021098570491e-07, + "logits/chosen": 3.921875, + "logits/rejected": 4.0078125, + "logps/chosen": -619.5, + "logps/rejected": -526.5, + "loss": 0.5987, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.966796875, + "rewards/margins": 4.44921875, + "rewards/rejected": -2.486328125, + "step": 2869 + }, + { + "epoch": 0.5694726920978224, + "grad_norm": 30.81532696028643, + "learning_rate": 5.197911397599056e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.109375, + "logps/chosen": -941.0, + "logps/rejected": -721.5, + "loss": 0.3726, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.38671875, + "rewards/margins": 8.9765625, + "rewards/rejected": -5.59375, + "step": 2870 + }, + { + "epoch": 0.569671114638623, + "grad_norm": 31.67216344562522, + "learning_rate": 5.194801841534158e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.94140625, + "logps/chosen": -897.0, 
+ "logps/rejected": -604.0, + "loss": 0.4442, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.90625, + "rewards/margins": 6.72265625, + "rewards/rejected": -3.818359375, + "step": 2871 + }, + { + "epoch": 0.5698695371794236, + "grad_norm": 30.726079224465362, + "learning_rate": 5.191692431867395e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.859375, + "logps/chosen": -975.0, + "logps/rejected": -579.5, + "loss": 0.5038, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.845703125, + "rewards/margins": 5.5078125, + "rewards/rejected": -2.671875, + "step": 2872 + }, + { + "epoch": 0.5700679597202242, + "grad_norm": 32.07979676132703, + "learning_rate": 5.188583170090296e-07, + "logits/chosen": 3.921875, + "logits/rejected": 3.875, + "logps/chosen": -722.5, + "logps/rejected": -483.0, + "loss": 0.5516, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.83984375, + "rewards/margins": 4.8046875, + "rewards/rejected": -2.95703125, + "step": 2873 + }, + { + "epoch": 0.5702663822610249, + "grad_norm": 29.962564995222955, + "learning_rate": 5.185474057694317e-07, + "logits/chosen": 3.7734375, + "logits/rejected": 3.9296875, + "logps/chosen": -868.5, + "logps/rejected": -743.0, + "loss": 0.5088, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.515625, + "rewards/margins": 7.21484375, + "rewards/rejected": -4.701416015625, + "step": 2874 + }, + { + "epoch": 0.5704648048018255, + "grad_norm": 28.789572884093932, + "learning_rate": 5.182365096170848e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.140625, + "logps/chosen": -557.0, + "logps/rejected": -426.5, + "loss": 0.5914, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.79296875, + "rewards/margins": 4.046875, + "rewards/rejected": -2.251953125, + "step": 2875 + }, + { + "epoch": 0.5706632273426261, + "grad_norm": 38.894268839209126, + "learning_rate": 5.179256287011199e-07, + "logits/chosen": 4.7109375, + "logits/rejected": 4.484375, + "logps/chosen": -791.0, + 
"logps/rejected": -658.0, + "loss": 0.5215, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.736328125, + "rewards/margins": 4.765625, + "rewards/rejected": -3.0341796875, + "step": 2876 + }, + { + "epoch": 0.5708616498834268, + "grad_norm": 27.409440345084466, + "learning_rate": 5.176147631706612e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 3.640625, + "logps/chosen": -924.0, + "logps/rejected": -625.5, + "loss": 0.4152, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3544921875, + "rewards/margins": 6.40234375, + "rewards/rejected": -4.02734375, + "step": 2877 + }, + { + "epoch": 0.5710600724242274, + "grad_norm": 31.06556338722407, + "learning_rate": 5.173039131748253e-07, + "logits/chosen": 3.51953125, + "logits/rejected": 3.26953125, + "logps/chosen": -1175.0, + "logps/rejected": -678.0, + "loss": 0.3513, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.078125, + "rewards/margins": 8.03125, + "rewards/rejected": -4.94921875, + "step": 2878 + }, + { + "epoch": 0.571258494965028, + "grad_norm": 23.405254938981397, + "learning_rate": 5.169930788627213e-07, + "logits/chosen": 4.3984375, + "logits/rejected": 4.64453125, + "logps/chosen": -1294.0, + "logps/rejected": -1607.0, + "loss": 0.4253, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.560546875, + "rewards/margins": 9.6484375, + "rewards/rejected": -7.091796875, + "step": 2879 + }, + { + "epoch": 0.5714569175058286, + "grad_norm": 35.54045405559371, + "learning_rate": 5.166822603834513e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 4.203125, + "logps/chosen": -1010.0, + "logps/rejected": -903.0, + "loss": 0.3565, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.578125, + "rewards/margins": 18.6171875, + "rewards/rejected": -15.0, + "step": 2880 + }, + { + "epoch": 0.5716553400466293, + "grad_norm": 32.99725992279302, + "learning_rate": 5.163714578861091e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 4.40625, + "logps/chosen": -925.0, + 
"logps/rejected": -1173.0, + "loss": 0.3978, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.310546875, + "rewards/margins": 8.703125, + "rewards/rejected": -6.40234375, + "step": 2881 + }, + { + "epoch": 0.5718537625874299, + "grad_norm": 26.089885474211947, + "learning_rate": 5.160606715197813e-07, + "logits/chosen": 3.6796875, + "logits/rejected": 3.90625, + "logps/chosen": -860.0, + "logps/rejected": -749.0, + "loss": 0.2903, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.216796875, + "rewards/margins": 8.8828125, + "rewards/rejected": -5.671875, + "step": 2882 + }, + { + "epoch": 0.5720521851282305, + "grad_norm": 25.141437544518325, + "learning_rate": 5.157499014335463e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.15234375, + "logps/chosen": -964.5, + "logps/rejected": -702.5, + "loss": 0.3728, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.671875, + "rewards/margins": 7.6328125, + "rewards/rejected": -4.96875, + "step": 2883 + }, + { + "epoch": 0.5722506076690312, + "grad_norm": 34.361644508090535, + "learning_rate": 5.154391477764753e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 3.6796875, + "logps/chosen": -1173.0, + "logps/rejected": -852.0, + "loss": 0.4103, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8984375, + "rewards/margins": 8.21875, + "rewards/rejected": -5.32421875, + "step": 2884 + }, + { + "epoch": 0.5724490302098318, + "grad_norm": 28.58609055514169, + "learning_rate": 5.151284106976311e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.296875, + "logps/chosen": -798.5, + "logps/rejected": -1009.0, + "loss": 0.5661, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.42236328125, + "rewards/margins": 7.765625, + "rewards/rejected": -5.33203125, + "step": 2885 + }, + { + "epoch": 0.5726474527506324, + "grad_norm": 33.49641384026636, + "learning_rate": 5.148176903460688e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.01171875, + "logps/chosen": -726.0, + "logps/rejected": 
-1002.5, + "loss": 0.5694, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.501953125, + "rewards/margins": 6.63671875, + "rewards/rejected": -4.138671875, + "step": 2886 + }, + { + "epoch": 0.5728458752914332, + "grad_norm": 38.42231746585093, + "learning_rate": 5.145069868708358e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 4.078125, + "logps/chosen": -1112.0, + "logps/rejected": -787.0, + "loss": 0.325, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.439453125, + "rewards/margins": 7.515625, + "rewards/rejected": -5.0703125, + "step": 2887 + }, + { + "epoch": 0.5730442978322338, + "grad_norm": 29.708681762395134, + "learning_rate": 5.141963004209704e-07, + "logits/chosen": 3.859375, + "logits/rejected": 4.09375, + "logps/chosen": -621.0, + "logps/rejected": -795.0, + "loss": 0.406, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.138671875, + "rewards/margins": 7.265625, + "rewards/rejected": -5.125, + "step": 2888 + }, + { + "epoch": 0.5732427203730344, + "grad_norm": 29.737491479341223, + "learning_rate": 5.138856311455039e-07, + "logits/chosen": 4.03125, + "logits/rejected": 3.7734375, + "logps/chosen": -1015.0, + "logps/rejected": -651.5, + "loss": 0.3057, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.78515625, + "rewards/margins": 8.3203125, + "rewards/rejected": -5.5390625, + "step": 2889 + }, + { + "epoch": 0.573441142913835, + "grad_norm": 35.25399590883318, + "learning_rate": 5.135749791934585e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.1953125, + "logps/chosen": -1251.0, + "logps/rejected": -839.0, + "loss": 0.4487, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.50390625, + "rewards/margins": 7.453125, + "rewards/rejected": -4.9375, + "step": 2890 + }, + { + "epoch": 0.5736395654546357, + "grad_norm": 38.17885768720551, + "learning_rate": 5.132643447138487e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.06640625, + "logps/chosen": -1298.0, + "logps/rejected": -1181.0, + "loss": 0.4435, 
+ "rewards/accuracies": 0.875, + "rewards/chosen": 3.16796875, + "rewards/margins": 9.46484375, + "rewards/rejected": -6.3046875, + "step": 2891 + }, + { + "epoch": 0.5738379879954363, + "grad_norm": 31.95222070197486, + "learning_rate": 5.129537278556801e-07, + "logits/chosen": 4.3984375, + "logits/rejected": 4.3046875, + "logps/chosen": -1056.5, + "logps/rejected": -767.0, + "loss": 0.3567, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.930450439453125, + "rewards/margins": 8.935546875, + "rewards/rejected": -6.0, + "step": 2892 + }, + { + "epoch": 0.5740364105362369, + "grad_norm": 35.26593130357222, + "learning_rate": 5.126431287679502e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 3.921875, + "logps/chosen": -1019.5, + "logps/rejected": -697.0, + "loss": 0.3968, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.814453125, + "rewards/margins": 7.78125, + "rewards/rejected": -4.9453125, + "step": 2893 + }, + { + "epoch": 0.5742348330770376, + "grad_norm": 31.217756346941503, + "learning_rate": 5.123325475996478e-07, + "logits/chosen": 3.84375, + "logits/rejected": 4.00390625, + "logps/chosen": -1092.0, + "logps/rejected": -888.0, + "loss": 0.3933, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.99609375, + "rewards/margins": 9.515625, + "rewards/rejected": -6.5234375, + "step": 2894 + }, + { + "epoch": 0.5744332556178382, + "grad_norm": 32.19345695443251, + "learning_rate": 5.120219844997532e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.7890625, + "logps/chosen": -1132.0, + "logps/rejected": -1499.5, + "loss": 0.4236, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.44140625, + "rewards/margins": 9.22265625, + "rewards/rejected": -7.78515625, + "step": 2895 + }, + { + "epoch": 0.5746316781586388, + "grad_norm": 31.574675355217334, + "learning_rate": 5.117114396172382e-07, + "logits/chosen": 3.984375, + "logits/rejected": 3.59765625, + "logps/chosen": -1377.5, + "logps/rejected": -608.5, + "loss": 0.5089, + 
"rewards/accuracies": 0.78125, + "rewards/chosen": 0.052734375, + "rewards/margins": 4.08984375, + "rewards/rejected": -4.03125, + "step": 2896 + }, + { + "epoch": 0.5748301006994394, + "grad_norm": 28.86519520713883, + "learning_rate": 5.114009131010652e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.078125, + "logps/chosen": -1267.0, + "logps/rejected": -907.0, + "loss": 0.4419, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.59765625, + "rewards/margins": 8.1796875, + "rewards/rejected": -5.583984375, + "step": 2897 + }, + { + "epoch": 0.5750285232402401, + "grad_norm": 37.210754464262045, + "learning_rate": 5.110904051001888e-07, + "logits/chosen": 4.078125, + "logits/rejected": 3.765625, + "logps/chosen": -883.0, + "logps/rejected": -676.0, + "loss": 0.376, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.703125, + "rewards/margins": 7.8828125, + "rewards/rejected": -6.1796875, + "step": 2898 + }, + { + "epoch": 0.5752269457810407, + "grad_norm": 27.977500604580335, + "learning_rate": 5.107799157635538e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 3.96484375, + "logps/chosen": -1013.0, + "logps/rejected": -1045.0, + "loss": 0.3293, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.21240234375, + "rewards/margins": 9.859375, + "rewards/rejected": -7.65234375, + "step": 2899 + }, + { + "epoch": 0.5754253683218413, + "grad_norm": 32.52371400525322, + "learning_rate": 5.104694452400966e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 4.21484375, + "logps/chosen": -942.0, + "logps/rejected": -805.5, + "loss": 0.3248, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.81640625, + "rewards/margins": 10.1015625, + "rewards/rejected": -7.296875, + "step": 2900 + }, + { + "epoch": 0.575623790862642, + "grad_norm": 32.85985560701251, + "learning_rate": 5.101589936787442e-07, + "logits/chosen": 3.703125, + "logits/rejected": 3.86328125, + "logps/chosen": -920.0, + "logps/rejected": -694.0, + "loss": 0.5212, + 
"rewards/accuracies": 0.8125, + "rewards/chosen": 1.8955078125, + "rewards/margins": 5.6953125, + "rewards/rejected": -3.796875, + "step": 2901 + }, + { + "epoch": 0.5758222134034426, + "grad_norm": 30.08826568929044, + "learning_rate": 5.098485612284149e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.5859375, + "logps/chosen": -1509.0, + "logps/rejected": -854.0, + "loss": 0.2999, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7578125, + "rewards/margins": 9.3203125, + "rewards/rejected": -6.5625, + "step": 2902 + }, + { + "epoch": 0.5760206359442432, + "grad_norm": 31.329012605848828, + "learning_rate": 5.095381480380176e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.3046875, + "logps/chosen": -1008.0, + "logps/rejected": -826.0, + "loss": 0.3443, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.45703125, + "rewards/margins": 10.046875, + "rewards/rejected": -7.6171875, + "step": 2903 + }, + { + "epoch": 0.576219058485044, + "grad_norm": 31.693333522159516, + "learning_rate": 5.092277542564518e-07, + "logits/chosen": 3.7421875, + "logits/rejected": 3.53125, + "logps/chosen": -971.0, + "logps/rejected": -747.0, + "loss": 0.3748, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.888671875, + "rewards/margins": 8.546875, + "rewards/rejected": -5.66015625, + "step": 2904 + }, + { + "epoch": 0.5764174810258446, + "grad_norm": 33.076912593737376, + "learning_rate": 5.089173800326083e-07, + "logits/chosen": 4.37890625, + "logits/rejected": 4.12109375, + "logps/chosen": -1182.0, + "logps/rejected": -747.0, + "loss": 0.4254, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.697265625, + "rewards/margins": 7.10546875, + "rewards/rejected": -4.408203125, + "step": 2905 + }, + { + "epoch": 0.5766159035666452, + "grad_norm": 33.59272995431481, + "learning_rate": 5.086070255153676e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.38671875, + "logps/chosen": -1096.0, + "logps/rejected": -820.0, + "loss": 0.4925, + "rewards/accuracies": 
0.71875, + "rewards/chosen": 2.1240234375, + "rewards/margins": 6.984375, + "rewards/rejected": -4.86328125, + "step": 2906 + }, + { + "epoch": 0.5768143261074458, + "grad_norm": 33.68583971650421, + "learning_rate": 5.082966908536017e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 4.07421875, + "logps/chosen": -650.0, + "logps/rejected": -548.5, + "loss": 0.3959, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.84375, + "rewards/margins": 5.9609375, + "rewards/rejected": -4.125, + "step": 2907 + }, + { + "epoch": 0.5770127486482465, + "grad_norm": 34.429719108930385, + "learning_rate": 5.079863761961723e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.17578125, + "logps/chosen": -855.0, + "logps/rejected": -561.5, + "loss": 0.4981, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.185546875, + "rewards/margins": 5.46484375, + "rewards/rejected": -3.259185791015625, + "step": 2908 + }, + { + "epoch": 0.5772111711890471, + "grad_norm": 34.054434951679454, + "learning_rate": 5.076760816919318e-07, + "logits/chosen": 4.6328125, + "logits/rejected": 4.2890625, + "logps/chosen": -990.0, + "logps/rejected": -617.0, + "loss": 0.4166, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.1015625, + "rewards/margins": 6.5859375, + "rewards/rejected": -4.48828125, + "step": 2909 + }, + { + "epoch": 0.5774095937298477, + "grad_norm": 26.72998483706989, + "learning_rate": 5.073658074897233e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.015625, + "logps/chosen": -992.0, + "logps/rejected": -773.0, + "loss": 0.3468, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.76171875, + "rewards/margins": 6.71875, + "rewards/rejected": -3.9609375, + "step": 2910 + }, + { + "epoch": 0.5776080162706484, + "grad_norm": 35.566052833881514, + "learning_rate": 5.070555537383794e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.1015625, + "logps/chosen": -836.0, + "logps/rejected": -727.0, + "loss": 0.5273, + "rewards/accuracies": 0.84375, + 
"rewards/chosen": 2.306640625, + "rewards/margins": 5.83984375, + "rewards/rejected": -3.541015625, + "step": 2911 + }, + { + "epoch": 0.577806438811449, + "grad_norm": 33.87950333774611, + "learning_rate": 5.067453205867238e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.7734375, + "logps/chosen": -912.0, + "logps/rejected": -686.0, + "loss": 0.3688, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.529296875, + "rewards/margins": 7.5078125, + "rewards/rejected": -4.96484375, + "step": 2912 + }, + { + "epoch": 0.5780048613522496, + "grad_norm": 28.120452586507025, + "learning_rate": 5.064351081835694e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 3.8203125, + "logps/chosen": -1012.5, + "logps/rejected": -714.5, + "loss": 0.2553, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.65234375, + "rewards/margins": 8.71875, + "rewards/rejected": -6.0390625, + "step": 2913 + }, + { + "epoch": 0.5782032838930502, + "grad_norm": 25.869225787295242, + "learning_rate": 5.061249166777198e-07, + "logits/chosen": 3.953125, + "logits/rejected": 3.859375, + "logps/chosen": -1004.0, + "logps/rejected": -724.0, + "loss": 0.4459, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8203125, + "rewards/margins": 7.38671875, + "rewards/rejected": -4.57421875, + "step": 2914 + }, + { + "epoch": 0.5784017064338509, + "grad_norm": 42.67036503873373, + "learning_rate": 5.058147462179683e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 3.9765625, + "logps/chosen": -980.0, + "logps/rejected": -679.0, + "loss": 0.338, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8671875, + "rewards/margins": 8.6484375, + "rewards/rejected": -5.77734375, + "step": 2915 + }, + { + "epoch": 0.5786001289746515, + "grad_norm": 26.607688721041605, + "learning_rate": 5.055045969530983e-07, + "logits/chosen": 4.140625, + "logits/rejected": 3.9609375, + "logps/chosen": -1167.0, + "logps/rejected": -737.0, + "loss": 0.3916, + "rewards/accuracies": 0.90625, + 
"rewards/chosen": 2.703125, + "rewards/margins": 7.296875, + "rewards/rejected": -4.6171875, + "step": 2916 + }, + { + "epoch": 0.5787985515154521, + "grad_norm": 29.73529302238861, + "learning_rate": 5.051944690318825e-07, + "logits/chosen": 3.8515625, + "logits/rejected": 3.796875, + "logps/chosen": -1291.0, + "logps/rejected": -798.0, + "loss": 0.2353, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.10546875, + "rewards/margins": 8.8828125, + "rewards/rejected": -5.76953125, + "step": 2917 + }, + { + "epoch": 0.5789969740562528, + "grad_norm": 32.957875305539176, + "learning_rate": 5.048843626030841e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.40625, + "logps/chosen": -1308.0, + "logps/rejected": -1157.5, + "loss": 0.2765, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.984375, + "rewards/margins": 11.9296875, + "rewards/rejected": -8.96875, + "step": 2918 + }, + { + "epoch": 0.5791953965970534, + "grad_norm": 25.91885482006006, + "learning_rate": 5.045742778154557e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 3.9375, + "logps/chosen": -681.0, + "logps/rejected": -501.25, + "loss": 0.684, + "rewards/accuracies": 0.59375, + "rewards/chosen": 1.45703125, + "rewards/margins": 3.458984375, + "rewards/rejected": -1.997802734375, + "step": 2919 + }, + { + "epoch": 0.579393819137854, + "grad_norm": 34.438395520392895, + "learning_rate": 5.042642148177391e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.27734375, + "logps/chosen": -1059.0, + "logps/rejected": -848.0, + "loss": 0.2911, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1171875, + "rewards/margins": 8.7734375, + "rewards/rejected": -5.64453125, + "step": 2920 + }, + { + "epoch": 0.5795922416786546, + "grad_norm": 24.0174354479439, + "learning_rate": 5.039541737586664e-07, + "logits/chosen": 3.5703125, + "logits/rejected": 3.578125, + "logps/chosen": -1027.5, + "logps/rejected": -808.5, + "loss": 0.4059, + "rewards/accuracies": 0.875, + "rewards/chosen": 
3.05859375, + "rewards/margins": 7.73828125, + "rewards/rejected": -4.68798828125, + "step": 2921 + }, + { + "epoch": 0.5797906642194554, + "grad_norm": 29.911240555475917, + "learning_rate": 5.036441547869586e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 4.23046875, + "logps/chosen": -770.5, + "logps/rejected": -1201.0, + "loss": 0.4417, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.490234375, + "rewards/margins": 7.65625, + "rewards/rejected": -5.17578125, + "step": 2922 + }, + { + "epoch": 0.579989086760256, + "grad_norm": 39.08373564761005, + "learning_rate": 5.033341580513263e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 3.83203125, + "logps/chosen": -1131.0, + "logps/rejected": -1045.0, + "loss": 0.3665, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.640625, + "rewards/margins": 9.0, + "rewards/rejected": -6.3515625, + "step": 2923 + }, + { + "epoch": 0.5801875093010566, + "grad_norm": 31.41841393418895, + "learning_rate": 5.030241837004694e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 3.95703125, + "logps/chosen": -1003.0, + "logps/rejected": -695.5, + "loss": 0.3712, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.65234375, + "rewards/margins": 7.421875, + "rewards/rejected": -5.765625, + "step": 2924 + }, + { + "epoch": 0.5803859318418573, + "grad_norm": 30.47090744399558, + "learning_rate": 5.027142318830772e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.21875, + "logps/chosen": -822.0, + "logps/rejected": -679.0, + "loss": 0.4078, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.0068359375, + "rewards/margins": 7.546875, + "rewards/rejected": -5.5390625, + "step": 2925 + }, + { + "epoch": 0.5805843543826579, + "grad_norm": 33.06897502617434, + "learning_rate": 5.024043027478281e-07, + "logits/chosen": 4.578125, + "logits/rejected": 4.55078125, + "logps/chosen": -839.0, + "logps/rejected": -715.5, + "loss": 0.4105, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.990234375, + 
"rewards/margins": 7.4921875, + "rewards/rejected": -5.515625, + "step": 2926 + }, + { + "epoch": 0.5807827769234585, + "grad_norm": 33.032220327589016, + "learning_rate": 5.020943964433893e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.0859375, + "logps/chosen": -1289.0, + "logps/rejected": -754.5, + "loss": 0.328, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.521484375, + "rewards/margins": 9.578125, + "rewards/rejected": -6.0390625, + "step": 2927 + }, + { + "epoch": 0.5809811994642592, + "grad_norm": 29.99706560262437, + "learning_rate": 5.017845131184177e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.5, + "logps/chosen": -780.5, + "logps/rejected": -695.5, + "loss": 0.3621, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.158203125, + "rewards/margins": 7.6640625, + "rewards/rejected": -5.50390625, + "step": 2928 + }, + { + "epoch": 0.5811796220050598, + "grad_norm": 29.802082267191878, + "learning_rate": 5.014746529215587e-07, + "logits/chosen": 4.578125, + "logits/rejected": 4.3671875, + "logps/chosen": -704.5, + "logps/rejected": -561.5, + "loss": 0.4434, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.892578125, + "rewards/margins": 5.5859375, + "rewards/rejected": -3.69921875, + "step": 2929 + }, + { + "epoch": 0.5813780445458604, + "grad_norm": 32.372427157551414, + "learning_rate": 5.011648160014466e-07, + "logits/chosen": 4.37109375, + "logits/rejected": 4.21875, + "logps/chosen": -937.0, + "logps/rejected": -737.5, + "loss": 0.3024, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.40234375, + "rewards/margins": 9.03125, + "rewards/rejected": -6.62109375, + "step": 2930 + }, + { + "epoch": 0.581576467086661, + "grad_norm": 32.02753904985036, + "learning_rate": 5.008550025067048e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 4.44140625, + "logps/chosen": -894.0, + "logps/rejected": -664.0, + "loss": 0.3868, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.6826171875, + "rewards/margins": 
7.6640625, + "rewards/rejected": -5.9765625, + "step": 2931 + }, + { + "epoch": 0.5817748896274617, + "grad_norm": 35.2284490624616, + "learning_rate": 5.005452125859456e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.109375, + "logps/chosen": -995.0, + "logps/rejected": -806.0, + "loss": 0.4993, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.822265625, + "rewards/margins": 6.2109375, + "rewards/rejected": -4.390625, + "step": 2932 + }, + { + "epoch": 0.5819733121682623, + "grad_norm": 26.175183258119112, + "learning_rate": 5.002354463877692e-07, + "logits/chosen": 3.984375, + "logits/rejected": 4.1875, + "logps/chosen": -963.0, + "logps/rejected": -802.0, + "loss": 0.4665, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.62109375, + "rewards/margins": 8.1484375, + "rewards/rejected": -5.53515625, + "step": 2933 + }, + { + "epoch": 0.5821717347090629, + "grad_norm": 29.83355538221672, + "learning_rate": 4.999257040607651e-07, + "logits/chosen": 4.59375, + "logits/rejected": 4.24609375, + "logps/chosen": -1006.0, + "logps/rejected": -570.0, + "loss": 0.3819, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.525390625, + "rewards/margins": 7.8125, + "rewards/rejected": -5.2666015625, + "step": 2934 + }, + { + "epoch": 0.5823701572498636, + "grad_norm": 26.202300768766268, + "learning_rate": 4.996159857535115e-07, + "logits/chosen": 4.53125, + "logits/rejected": 4.24609375, + "logps/chosen": -775.0, + "logps/rejected": -586.5, + "loss": 0.3688, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.361328125, + "rewards/margins": 7.5859375, + "rewards/rejected": -5.23828125, + "step": 2935 + }, + { + "epoch": 0.5825685797906642, + "grad_norm": 25.57659008587677, + "learning_rate": 4.993062916145744e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.3125, + "logps/chosen": -895.0, + "logps/rejected": -514.5, + "loss": 0.4101, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.12890625, + "rewards/margins": 8.0703125, + 
"rewards/rejected": -5.9453125, + "step": 2936 + }, + { + "epoch": 0.5827670023314648, + "grad_norm": 27.85346118331617, + "learning_rate": 4.989966217925089e-07, + "logits/chosen": 4.28125, + "logits/rejected": 3.94140625, + "logps/chosen": -833.0, + "logps/rejected": -576.5, + "loss": 0.4088, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.45703125, + "rewards/margins": 6.9140625, + "rewards/rejected": -4.45703125, + "step": 2937 + }, + { + "epoch": 0.5829654248722654, + "grad_norm": 39.59099201368519, + "learning_rate": 4.986869764358578e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.12890625, + "logps/chosen": -713.5, + "logps/rejected": -682.5, + "loss": 0.529, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.296875, + "rewards/margins": 5.84765625, + "rewards/rejected": -3.544921875, + "step": 2938 + }, + { + "epoch": 0.5831638474130661, + "grad_norm": 25.30371769011437, + "learning_rate": 4.983773556931527e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.109375, + "logps/chosen": -779.5, + "logps/rejected": -563.5, + "loss": 0.3896, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.53125, + "rewards/margins": 5.96875, + "rewards/rejected": -3.44140625, + "step": 2939 + }, + { + "epoch": 0.5833622699538668, + "grad_norm": 27.98122175827763, + "learning_rate": 4.980677597129129e-07, + "logits/chosen": 3.64453125, + "logits/rejected": 3.53515625, + "logps/chosen": -874.0, + "logps/rejected": -922.0, + "loss": 0.497, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.10546875, + "rewards/margins": 15.953125, + "rewards/rejected": -13.8828125, + "step": 2940 + }, + { + "epoch": 0.5835606924946674, + "grad_norm": 34.901754292147274, + "learning_rate": 4.977581886436462e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.0703125, + "logps/chosen": -901.0, + "logps/rejected": -800.0, + "loss": 0.5043, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.0859375, + "rewards/margins": 7.046875, + "rewards/rejected": 
-4.953125, + "step": 2941 + }, + { + "epoch": 0.5837591150354681, + "grad_norm": 32.033391535743675, + "learning_rate": 4.974486426338489e-07, + "logits/chosen": 4.375, + "logits/rejected": 3.9609375, + "logps/chosen": -899.0, + "logps/rejected": -710.5, + "loss": 0.4304, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4921875, + "rewards/margins": 7.1640625, + "rewards/rejected": -4.66796875, + "step": 2942 + }, + { + "epoch": 0.5839575375762687, + "grad_norm": 36.92568458895008, + "learning_rate": 4.971391218320039e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 3.90625, + "logps/chosen": -825.5, + "logps/rejected": -570.0, + "loss": 0.3582, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.34375, + "rewards/margins": 6.2421875, + "rewards/rejected": -2.892578125, + "step": 2943 + }, + { + "epoch": 0.5841559601170693, + "grad_norm": 39.51039562801104, + "learning_rate": 4.968296263865832e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 4.0625, + "logps/chosen": -1101.5, + "logps/rejected": -723.5, + "loss": 0.3544, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.55078125, + "rewards/margins": 6.671875, + "rewards/rejected": -4.11328125, + "step": 2944 + }, + { + "epoch": 0.58435438265787, + "grad_norm": 39.32795147959505, + "learning_rate": 4.965201564460463e-07, + "logits/chosen": 4.4765625, + "logits/rejected": 4.20703125, + "logps/chosen": -1202.0, + "logps/rejected": -615.5, + "loss": 0.4357, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.80615234375, + "rewards/margins": 6.5078125, + "rewards/rejected": -4.6953125, + "step": 2945 + }, + { + "epoch": 0.5845528051986706, + "grad_norm": 31.283719211514665, + "learning_rate": 4.962107121588405e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.90234375, + "logps/chosen": -665.0, + "logps/rejected": -632.0, + "loss": 0.5663, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.095703125, + "rewards/margins": 5.13671875, + "rewards/rejected": -3.037109375, + 
"step": 2946 + }, + { + "epoch": 0.5847512277394712, + "grad_norm": 46.32597213022499, + "learning_rate": 4.959012936734004e-07, + "logits/chosen": 4.37109375, + "logits/rejected": 4.27734375, + "logps/chosen": -1850.0, + "logps/rejected": -925.0, + "loss": 0.4302, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.890625, + "rewards/margins": 7.41015625, + "rewards/rejected": -6.53125, + "step": 2947 + }, + { + "epoch": 0.5849496502802718, + "grad_norm": 28.07221000805209, + "learning_rate": 4.955919011381489e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 3.96484375, + "logps/chosen": -822.0, + "logps/rejected": -505.5, + "loss": 0.4226, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.455078125, + "rewards/margins": 6.73828125, + "rewards/rejected": -4.28515625, + "step": 2948 + }, + { + "epoch": 0.5851480728210725, + "grad_norm": 25.405377956287676, + "learning_rate": 4.952825347014959e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.015625, + "logps/chosen": -925.0, + "logps/rejected": -689.0, + "loss": 0.3141, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.66015625, + "rewards/margins": 13.609375, + "rewards/rejected": -10.953125, + "step": 2949 + }, + { + "epoch": 0.5853464953618731, + "grad_norm": 32.24361455319709, + "learning_rate": 4.949731945118389e-07, + "logits/chosen": 4.0625, + "logits/rejected": 3.921875, + "logps/chosen": -1131.0, + "logps/rejected": -1039.0, + "loss": 0.3296, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.171875, + "rewards/margins": 9.9296875, + "rewards/rejected": -6.765625, + "step": 2950 + }, + { + "epoch": 0.5855449179026737, + "grad_norm": 28.428789210427013, + "learning_rate": 4.946638807175632e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 3.8515625, + "logps/chosen": -746.0, + "logps/rejected": -674.0, + "loss": 0.5155, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2587890625, + "rewards/margins": 6.28515625, + "rewards/rejected": -4.01171875, + "step": 2951 + }, + { + 
"epoch": 0.5857433404434744, + "grad_norm": 34.69626871076542, + "learning_rate": 4.943545934670409e-07, + "logits/chosen": 3.796875, + "logits/rejected": 3.7265625, + "logps/chosen": -948.0, + "logps/rejected": -904.0, + "loss": 0.428, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.048828125, + "rewards/margins": 7.65625, + "rewards/rejected": -5.60546875, + "step": 2952 + }, + { + "epoch": 0.585941762984275, + "grad_norm": 27.400140347579267, + "learning_rate": 4.940453329086318e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.265625, + "logps/chosen": -850.5, + "logps/rejected": -634.0, + "loss": 0.4104, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.162109375, + "rewards/margins": 6.9921875, + "rewards/rejected": -4.828125, + "step": 2953 + }, + { + "epoch": 0.5861401855250756, + "grad_norm": 27.909877785877732, + "learning_rate": 4.937360991906824e-07, + "logits/chosen": 4.71875, + "logits/rejected": 4.5234375, + "logps/chosen": -1233.0, + "logps/rejected": -764.0, + "loss": 0.353, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2265625, + "rewards/margins": 8.5859375, + "rewards/rejected": -5.359375, + "step": 2954 + }, + { + "epoch": 0.5863386080658762, + "grad_norm": 26.687499851541194, + "learning_rate": 4.934268924615268e-07, + "logits/chosen": 4.29296875, + "logits/rejected": 4.12109375, + "logps/chosen": -1265.0, + "logps/rejected": -792.0, + "loss": 0.3484, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.876953125, + "rewards/margins": 8.9765625, + "rewards/rejected": -6.0859375, + "step": 2955 + }, + { + "epoch": 0.586537030606677, + "grad_norm": 45.97231577348457, + "learning_rate": 4.93117712869486e-07, + "logits/chosen": 3.7734375, + "logits/rejected": 3.6875, + "logps/chosen": -937.0, + "logps/rejected": -700.0, + "loss": 0.4397, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.326171875, + "rewards/margins": 7.578125, + "rewards/rejected": -5.25, + "step": 2956 + }, + { + "epoch": 0.5867354531474775, + 
"grad_norm": 25.048493677334864, + "learning_rate": 4.928085605628678e-07, + "logits/chosen": 3.734375, + "logits/rejected": 4.03515625, + "logps/chosen": -781.0, + "logps/rejected": -1377.0, + "loss": 0.4298, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.412109375, + "rewards/margins": 9.0859375, + "rewards/rejected": -6.671875, + "step": 2957 + }, + { + "epoch": 0.5869338756882782, + "grad_norm": 36.56723581220345, + "learning_rate": 4.924994356899675e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 3.9140625, + "logps/chosen": -935.0, + "logps/rejected": -669.5, + "loss": 0.4056, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.859375, + "rewards/margins": 7.9765625, + "rewards/rejected": -5.1171875, + "step": 2958 + }, + { + "epoch": 0.5871322982290789, + "grad_norm": 31.942795989303857, + "learning_rate": 4.921903383990664e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.0703125, + "logps/chosen": -1160.0, + "logps/rejected": -794.5, + "loss": 0.3049, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2109375, + "rewards/margins": 8.90625, + "rewards/rejected": -5.703125, + "step": 2959 + }, + { + "epoch": 0.5873307207698795, + "grad_norm": 40.67564050477791, + "learning_rate": 4.918812688384329e-07, + "logits/chosen": 4.33203125, + "logits/rejected": 4.5390625, + "logps/chosen": -955.0, + "logps/rejected": -1054.5, + "loss": 0.5703, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0849609375, + "rewards/margins": 7.0361328125, + "rewards/rejected": -5.966796875, + "step": 2960 + }, + { + "epoch": 0.5875291433106801, + "grad_norm": 27.021773929615534, + "learning_rate": 4.915722271563224e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 3.859375, + "logps/chosen": -850.0, + "logps/rejected": -529.0, + "loss": 0.3455, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9609375, + "rewards/margins": 7.4140625, + "rewards/rejected": -4.4609375, + "step": 2961 + }, + { + "epoch": 0.5877275658514808, + "grad_norm": 
35.27743799203298, + "learning_rate": 4.912632135009769e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 3.72265625, + "logps/chosen": -1111.0, + "logps/rejected": -785.0, + "loss": 0.4012, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.26953125, + "rewards/margins": 6.59375, + "rewards/rejected": -4.3203125, + "step": 2962 + }, + { + "epoch": 0.5879259883922814, + "grad_norm": 34.770025870932294, + "learning_rate": 4.909542280206242e-07, + "logits/chosen": 3.75, + "logits/rejected": 3.796875, + "logps/chosen": -1217.0, + "logps/rejected": -973.0, + "loss": 0.3669, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.546875, + "rewards/margins": 8.390625, + "rewards/rejected": -5.828125, + "step": 2963 + }, + { + "epoch": 0.588124410933082, + "grad_norm": 33.66632220228197, + "learning_rate": 4.906452708634797e-07, + "logits/chosen": 3.38671875, + "logits/rejected": 3.21484375, + "logps/chosen": -980.0, + "logps/rejected": -784.0, + "loss": 0.3042, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.703125, + "rewards/margins": 8.859375, + "rewards/rejected": -6.1640625, + "step": 2964 + }, + { + "epoch": 0.5883228334738826, + "grad_norm": 32.878340100801026, + "learning_rate": 4.903363421777444e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 3.9453125, + "logps/chosen": -1149.0, + "logps/rejected": -694.5, + "loss": 0.3603, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.560546875, + "rewards/margins": 6.890625, + "rewards/rejected": -4.33984375, + "step": 2965 + }, + { + "epoch": 0.5885212560146833, + "grad_norm": 27.283481761984643, + "learning_rate": 4.900274421116058e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.921875, + "logps/chosen": -1201.0, + "logps/rejected": -887.0, + "loss": 0.3158, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1328125, + "rewards/margins": 8.359375, + "rewards/rejected": -5.25, + "step": 2966 + }, + { + "epoch": 0.5887196785554839, + "grad_norm": 36.19414860508901, + 
"learning_rate": 4.897185708132381e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 3.8515625, + "logps/chosen": -1282.0, + "logps/rejected": -1019.0, + "loss": 0.3424, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.19140625, + "rewards/margins": 9.1875, + "rewards/rejected": -6.9921875, + "step": 2967 + }, + { + "epoch": 0.5889181010962845, + "grad_norm": 26.867537630350384, + "learning_rate": 4.89409728430801e-07, + "logits/chosen": 4.9375, + "logits/rejected": 4.6953125, + "logps/chosen": -956.0, + "logps/rejected": -868.0, + "loss": 0.4746, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.73046875, + "rewards/margins": 6.84375, + "rewards/rejected": -4.1015625, + "step": 2968 + }, + { + "epoch": 0.5891165236370852, + "grad_norm": 42.33243232483165, + "learning_rate": 4.891009151124411e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.5390625, + "logps/chosen": -968.0, + "logps/rejected": -698.5, + "loss": 0.2758, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.94140625, + "rewards/margins": 8.953125, + "rewards/rejected": -6.00390625, + "step": 2969 + }, + { + "epoch": 0.5893149461778858, + "grad_norm": 21.605662216924998, + "learning_rate": 4.887921310062903e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.26953125, + "logps/chosen": -970.0, + "logps/rejected": -1124.0, + "loss": 0.3802, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.62109375, + "rewards/margins": 10.0, + "rewards/rejected": -7.396484375, + "step": 2970 + }, + { + "epoch": 0.5895133687186864, + "grad_norm": 35.647287304961466, + "learning_rate": 4.884833762604671e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 4.11328125, + "logps/chosen": -850.0, + "logps/rejected": -843.0, + "loss": 0.5106, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.12109375, + "rewards/margins": 6.703125, + "rewards/rejected": -4.59375, + "step": 2971 + }, + { + "epoch": 0.589711791259487, + "grad_norm": 33.47259988592813, + "learning_rate": 
4.881746510230754e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.46875, + "logps/chosen": -840.5, + "logps/rejected": -822.0, + "loss": 0.464, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9580078125, + "rewards/margins": 6.65625, + "rewards/rejected": -4.70703125, + "step": 2972 + }, + { + "epoch": 0.5899102138002877, + "grad_norm": 31.90597162057293, + "learning_rate": 4.878659554422054e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 3.76171875, + "logps/chosen": -981.0, + "logps/rejected": -719.0, + "loss": 0.3915, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.2734375, + "rewards/margins": 7.8203125, + "rewards/rejected": -5.55859375, + "step": 2973 + }, + { + "epoch": 0.5901086363410883, + "grad_norm": 28.70657268878658, + "learning_rate": 4.875572896659331e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.1015625, + "logps/chosen": -868.0, + "logps/rejected": -648.5, + "loss": 0.4058, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.173828125, + "rewards/margins": 7.6875, + "rewards/rejected": -5.53125, + "step": 2974 + }, + { + "epoch": 0.590307058881889, + "grad_norm": 30.160356818255604, + "learning_rate": 4.872486538423195e-07, + "logits/chosen": 3.80859375, + "logits/rejected": 4.01953125, + "logps/chosen": -989.5, + "logps/rejected": -723.0, + "loss": 0.3682, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.1591796875, + "rewards/margins": 7.4609375, + "rewards/rejected": -5.3046875, + "step": 2975 + }, + { + "epoch": 0.5905054814226897, + "grad_norm": 24.501975717361226, + "learning_rate": 4.869400481194122e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 4.0, + "logps/chosen": -1239.0, + "logps/rejected": -941.0, + "loss": 0.3375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1484375, + "rewards/margins": 9.375, + "rewards/rejected": -6.21875, + "step": 2976 + }, + { + "epoch": 0.5907039039634903, + "grad_norm": 31.998751602086518, + "learning_rate": 4.866314726452435e-07, + 
"logits/chosen": 3.95703125, + "logits/rejected": 3.79296875, + "logps/chosen": -838.0, + "logps/rejected": -920.5, + "loss": 0.3973, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.12890625, + "rewards/margins": 8.0390625, + "rewards/rejected": -5.890625, + "step": 2977 + }, + { + "epoch": 0.5909023265042909, + "grad_norm": 29.483065594608362, + "learning_rate": 4.86322927567832e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.44140625, + "logps/chosen": -1004.0, + "logps/rejected": -1238.0, + "loss": 0.4799, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.220703125, + "rewards/margins": 9.1796875, + "rewards/rejected": -6.9609375, + "step": 2978 + }, + { + "epoch": 0.5911007490450915, + "grad_norm": 32.25791822909822, + "learning_rate": 4.860144130351807e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 3.89453125, + "logps/chosen": -1140.0, + "logps/rejected": -1680.5, + "loss": 0.4695, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.064453125, + "rewards/margins": 11.3671875, + "rewards/rejected": -8.28125, + "step": 2979 + }, + { + "epoch": 0.5912991715858922, + "grad_norm": 36.131067856948206, + "learning_rate": 4.857059291952791e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.34375, + "logps/chosen": -1070.0, + "logps/rejected": -938.0, + "loss": 0.3179, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.60546875, + "rewards/margins": 9.8125, + "rewards/rejected": -7.1875, + "step": 2980 + }, + { + "epoch": 0.5914975941266928, + "grad_norm": 30.722052522654568, + "learning_rate": 4.85397476196101e-07, + "logits/chosen": 3.5546875, + "logits/rejected": 3.6640625, + "logps/chosen": -1215.0, + "logps/rejected": -1314.0, + "loss": 0.3228, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.51953125, + "rewards/margins": 11.3515625, + "rewards/rejected": -8.8359375, + "step": 2981 + }, + { + "epoch": 0.5916960166674934, + "grad_norm": 41.45964523503518, + "learning_rate": 4.850890541856061e-07, + "logits/chosen": 
4.18359375, + "logits/rejected": 3.76953125, + "logps/chosen": -1171.0, + "logps/rejected": -561.5, + "loss": 0.3755, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.390625, + "rewards/margins": 7.6015625, + "rewards/rejected": -5.2109375, + "step": 2982 + }, + { + "epoch": 0.5918944392082941, + "grad_norm": 26.555350906289856, + "learning_rate": 4.847806633117391e-07, + "logits/chosen": 3.85546875, + "logits/rejected": 4.234375, + "logps/chosen": -961.0, + "logps/rejected": -857.0, + "loss": 0.3334, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.50390625, + "rewards/margins": 8.5625, + "rewards/rejected": -5.0625, + "step": 2983 + }, + { + "epoch": 0.5920928617490947, + "grad_norm": 28.032492394382462, + "learning_rate": 4.844723037224291e-07, + "logits/chosen": 4.73046875, + "logits/rejected": 4.515625, + "logps/chosen": -1454.0, + "logps/rejected": -795.0, + "loss": 0.3173, + "rewards/accuracies": 0.90625, + "rewards/chosen": 4.08203125, + "rewards/margins": 9.078125, + "rewards/rejected": -4.9921875, + "step": 2984 + }, + { + "epoch": 0.5922912842898953, + "grad_norm": 29.44302164902955, + "learning_rate": 4.841639755655911e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 4.13671875, + "logps/chosen": -871.0, + "logps/rejected": -615.0, + "loss": 0.4491, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.88671875, + "rewards/margins": 6.9453125, + "rewards/rejected": -4.06640625, + "step": 2985 + }, + { + "epoch": 0.592489706830696, + "grad_norm": 36.80575485661895, + "learning_rate": 4.838556789891245e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 4.09765625, + "logps/chosen": -853.0, + "logps/rejected": -742.0, + "loss": 0.4274, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.46484375, + "rewards/margins": 7.50390625, + "rewards/rejected": -5.046875, + "step": 2986 + }, + { + "epoch": 0.5926881293714966, + "grad_norm": 32.02482112053613, + "learning_rate": 4.835474141409139e-07, + "logits/chosen": 3.87890625, + 
"logits/rejected": 3.94921875, + "logps/chosen": -1006.0, + "logps/rejected": -896.0, + "loss": 0.4341, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8984375, + "rewards/margins": 8.71875, + "rewards/rejected": -5.828125, + "step": 2987 + }, + { + "epoch": 0.5928865519122972, + "grad_norm": 29.176897264375185, + "learning_rate": 4.832391811688282e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 3.8359375, + "logps/chosen": -918.0, + "logps/rejected": -591.5, + "loss": 0.4163, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.552734375, + "rewards/margins": 7.140625, + "rewards/rejected": -4.6015625, + "step": 2988 + }, + { + "epoch": 0.5930849744530978, + "grad_norm": 30.075928178618803, + "learning_rate": 4.829309802207214e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.1796875, + "logps/chosen": -948.0, + "logps/rejected": -574.5, + "loss": 0.6144, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.265625, + "rewards/margins": 5.1015625, + "rewards/rejected": -2.8359375, + "step": 2989 + }, + { + "epoch": 0.5932833969938985, + "grad_norm": 47.609721445111184, + "learning_rate": 4.826228114444321e-07, + "logits/chosen": 4.66015625, + "logits/rejected": 4.45703125, + "logps/chosen": -1433.0, + "logps/rejected": -863.0, + "loss": 0.3991, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.927734375, + "rewards/margins": 8.25, + "rewards/rejected": -5.3359375, + "step": 2990 + }, + { + "epoch": 0.5934818195346991, + "grad_norm": 40.37445683478927, + "learning_rate": 4.823146749877833e-07, + "logits/chosen": 4.5234375, + "logits/rejected": 4.23046875, + "logps/chosen": -1033.0, + "logps/rejected": -717.0, + "loss": 0.3631, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.994140625, + "rewards/margins": 7.671875, + "rewards/rejected": -4.67578125, + "step": 2991 + }, + { + "epoch": 0.5936802420754997, + "grad_norm": 30.388690763677843, + "learning_rate": 4.820065709985826e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 
4.03515625, + "logps/chosen": -796.5, + "logps/rejected": -610.0, + "loss": 0.4806, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.87109375, + "rewards/margins": 5.921875, + "rewards/rejected": -4.05078125, + "step": 2992 + }, + { + "epoch": 0.5938786646163005, + "grad_norm": 28.62515993194937, + "learning_rate": 4.816984996246219e-07, + "logits/chosen": 4.49609375, + "logits/rejected": 4.375, + "logps/chosen": -833.5, + "logps/rejected": -578.0, + "loss": 0.4419, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.49609375, + "rewards/margins": 6.25390625, + "rewards/rejected": -3.75390625, + "step": 2993 + }, + { + "epoch": 0.5940770871571011, + "grad_norm": 37.66634678645396, + "learning_rate": 4.81390461013678e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.1484375, + "logps/chosen": -1008.0, + "logps/rejected": -1075.5, + "loss": 0.3189, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.291015625, + "rewards/margins": 9.25, + "rewards/rejected": -7.0, + "step": 2994 + }, + { + "epoch": 0.5942755096979017, + "grad_norm": 28.389774386402983, + "learning_rate": 4.810824553135109e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.26171875, + "logps/chosen": -1024.0, + "logps/rejected": -812.0, + "loss": 0.3005, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.564453125, + "rewards/margins": 10.125, + "rewards/rejected": -6.5546875, + "step": 2995 + }, + { + "epoch": 0.5944739322387023, + "grad_norm": 24.688835745646717, + "learning_rate": 4.807744826718659e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.296875, + "logps/chosen": -865.5, + "logps/rejected": -1900.5, + "loss": 0.3646, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.521484375, + "rewards/margins": 11.328125, + "rewards/rejected": -8.796875, + "step": 2996 + }, + { + "epoch": 0.594672354779503, + "grad_norm": 36.51842132696794, + "learning_rate": 4.804665432364719e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.22265625, + "logps/chosen": 
-666.5, + "logps/rejected": -796.0, + "loss": 0.5344, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.095703125, + "rewards/margins": 6.5546875, + "rewards/rejected": -4.4609375, + "step": 2997 + }, + { + "epoch": 0.5948707773203036, + "grad_norm": 36.68977904899599, + "learning_rate": 4.801586371550418e-07, + "logits/chosen": 3.703125, + "logits/rejected": 3.5625, + "logps/chosen": -1088.0, + "logps/rejected": -649.0, + "loss": 0.2373, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8046875, + "rewards/margins": 10.140625, + "rewards/rejected": -7.3359375, + "step": 2998 + }, + { + "epoch": 0.5950691998611042, + "grad_norm": 35.00629523788551, + "learning_rate": 4.798507645752731e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.2734375, + "logps/chosen": -1255.5, + "logps/rejected": -659.0, + "loss": 0.3491, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.494140625, + "rewards/margins": 8.1328125, + "rewards/rejected": -5.640625, + "step": 2999 + }, + { + "epoch": 0.5952676224019049, + "grad_norm": 31.912224631156974, + "learning_rate": 4.795429256448464e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.203125, + "logps/chosen": -991.0, + "logps/rejected": -668.0, + "loss": 0.4624, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.23828125, + "rewards/margins": 6.28125, + "rewards/rejected": -4.05078125, + "step": 3000 + }, + { + "epoch": 0.5954660449427055, + "grad_norm": 27.515593384686653, + "learning_rate": 4.792351205114267e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.04296875, + "logps/chosen": -866.0, + "logps/rejected": -802.0, + "loss": 0.4562, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.341796875, + "rewards/margins": 5.88671875, + "rewards/rejected": -3.544921875, + "step": 3001 + }, + { + "epoch": 0.5956644674835061, + "grad_norm": 40.801061737856244, + "learning_rate": 4.789273493226627e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 3.8984375, + "logps/chosen": -857.0, + 
"logps/rejected": -671.5, + "loss": 0.4787, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.2529296875, + "rewards/margins": 6.87109375, + "rewards/rejected": -4.609375, + "step": 3002 + }, + { + "epoch": 0.5958628900243068, + "grad_norm": 34.13176734374222, + "learning_rate": 4.786196122261869e-07, + "logits/chosen": 4.5234375, + "logits/rejected": 4.17578125, + "logps/chosen": -821.5, + "logps/rejected": -774.0, + "loss": 0.3633, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.201171875, + "rewards/margins": 10.53125, + "rewards/rejected": -9.3359375, + "step": 3003 + }, + { + "epoch": 0.5960613125651074, + "grad_norm": 30.23017417139539, + "learning_rate": 4.783119093696148e-07, + "logits/chosen": 4.43359375, + "logits/rejected": 4.03125, + "logps/chosen": -1004.0, + "logps/rejected": -716.5, + "loss": 0.3574, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.345703125, + "rewards/margins": 7.70703125, + "rewards/rejected": -5.357421875, + "step": 3004 + }, + { + "epoch": 0.596259735105908, + "grad_norm": 33.41735943389238, + "learning_rate": 4.780042409005466e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 3.6640625, + "logps/chosen": -699.0, + "logps/rejected": -565.0, + "loss": 0.5164, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.50146484375, + "rewards/margins": 5.4921875, + "rewards/rejected": -3.984375, + "step": 3005 + }, + { + "epoch": 0.5964581576467086, + "grad_norm": 34.920893386545025, + "learning_rate": 4.776966069665653e-07, + "logits/chosen": 4.3984375, + "logits/rejected": 4.5390625, + "logps/chosen": -1007.0, + "logps/rejected": -759.0, + "loss": 0.4393, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.1328125, + "rewards/margins": 6.8984375, + "rewards/rejected": -4.76171875, + "step": 3006 + }, + { + "epoch": 0.5966565801875093, + "grad_norm": 61.74559181523238, + "learning_rate": 4.77389007715237e-07, + "logits/chosen": 3.921875, + "logits/rejected": 3.44921875, + "logps/chosen": -1121.0, + 
"logps/rejected": -775.0, + "loss": 0.3683, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.30078125, + "rewards/margins": 7.1640625, + "rewards/rejected": -4.86328125, + "step": 3007 + }, + { + "epoch": 0.5968550027283099, + "grad_norm": 34.55998540084939, + "learning_rate": 4.770814432941122e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.09375, + "logps/chosen": -1023.0, + "logps/rejected": -910.0, + "loss": 0.3454, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.9375, + "rewards/margins": 8.484375, + "rewards/rejected": -6.5390625, + "step": 3008 + }, + { + "epoch": 0.5970534252691105, + "grad_norm": 32.64615730715272, + "learning_rate": 4.7677391385072353e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.40234375, + "logps/chosen": -667.0, + "logps/rejected": -876.5, + "loss": 0.3446, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.0, + "rewards/margins": 8.296875, + "rewards/rejected": -6.3125, + "step": 3009 + }, + { + "epoch": 0.5972518478099113, + "grad_norm": 29.96482056754343, + "learning_rate": 4.764664195325877e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 3.83984375, + "logps/chosen": -686.5, + "logps/rejected": -580.5, + "loss": 0.553, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.46484375, + "rewards/margins": 5.8046875, + "rewards/rejected": -4.33984375, + "step": 3010 + }, + { + "epoch": 0.5974502703507119, + "grad_norm": 32.29541998866303, + "learning_rate": 4.761589604872042e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 3.76953125, + "logps/chosen": -934.0, + "logps/rejected": -664.0, + "loss": 0.3507, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.55078125, + "rewards/margins": 7.2421875, + "rewards/rejected": -4.6875, + "step": 3011 + }, + { + "epoch": 0.5976486928915125, + "grad_norm": 34.95823327556825, + "learning_rate": 4.758515368620558e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.86328125, + "logps/chosen": -1061.0, + "logps/rejected": -823.0, + "loss": 
0.4103, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.234375, + "rewards/margins": 7.625, + "rewards/rejected": -5.390625, + "step": 3012 + }, + { + "epoch": 0.5978471154323131, + "grad_norm": 31.006553778518818, + "learning_rate": 4.755441488046079e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.91015625, + "logps/chosen": -861.0, + "logps/rejected": -577.5, + "loss": 0.4401, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.982421875, + "rewards/margins": 6.046875, + "rewards/rejected": -4.078125, + "step": 3013 + }, + { + "epoch": 0.5980455379731138, + "grad_norm": 34.2383118109388, + "learning_rate": 4.752367964623094e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.25390625, + "logps/chosen": -847.0, + "logps/rejected": -641.0, + "loss": 0.5631, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.94140625, + "rewards/margins": 5.2890625, + "rewards/rejected": -3.34765625, + "step": 3014 + }, + { + "epoch": 0.5982439605139144, + "grad_norm": 31.38358526922958, + "learning_rate": 4.7492947998259157e-07, + "logits/chosen": 3.796875, + "logits/rejected": 4.06640625, + "logps/chosen": -814.0, + "logps/rejected": -749.0, + "loss": 0.4401, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.8466796875, + "rewards/margins": 6.2578125, + "rewards/rejected": -4.41796875, + "step": 3015 + }, + { + "epoch": 0.598442383054715, + "grad_norm": 34.70584708018877, + "learning_rate": 4.7462219951286864e-07, + "logits/chosen": 4.46484375, + "logits/rejected": 4.3984375, + "logps/chosen": -985.0, + "logps/rejected": -733.5, + "loss": 0.4407, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.498046875, + "rewards/margins": 6.3359375, + "rewards/rejected": -3.84375, + "step": 3016 + }, + { + "epoch": 0.5986408055955157, + "grad_norm": 28.876311403917892, + "learning_rate": 4.7431495520053764e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.8671875, + "logps/chosen": -1194.0, + "logps/rejected": -890.0, + "loss": 0.4505, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 1.798828125, + "rewards/margins": 7.25, + "rewards/rejected": -5.4453125, + "step": 3017 + }, + { + "epoch": 0.5988392281363163, + "grad_norm": 31.42847324562956, + "learning_rate": 4.740077471929782e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.26953125, + "logps/chosen": -1064.5, + "logps/rejected": -1108.0, + "loss": 0.3683, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1640625, + "rewards/margins": 10.28125, + "rewards/rejected": -7.1171875, + "step": 3018 + }, + { + "epoch": 0.5990376506771169, + "grad_norm": 25.411238353880744, + "learning_rate": 4.7370057563755273e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.9140625, + "logps/chosen": -644.0, + "logps/rejected": -468.5, + "loss": 0.41, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.341796875, + "rewards/margins": 5.7890625, + "rewards/rejected": -3.44921875, + "step": 3019 + }, + { + "epoch": 0.5992360732179176, + "grad_norm": 34.98700692605508, + "learning_rate": 4.733934406816056e-07, + "logits/chosen": 3.796875, + "logits/rejected": 4.0, + "logps/chosen": -1519.0, + "logps/rejected": -1277.0, + "loss": 0.4231, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4921875, + "rewards/margins": 7.71875, + "rewards/rejected": -6.240234375, + "step": 3020 + }, + { + "epoch": 0.5994344957587182, + "grad_norm": 33.14319766781182, + "learning_rate": 4.7308634247246426e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 3.9140625, + "logps/chosen": -1074.0, + "logps/rejected": -656.5, + "loss": 0.4471, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.197265625, + "rewards/margins": 6.46875, + "rewards/rejected": -4.26953125, + "step": 3021 + }, + { + "epoch": 0.5996329182995188, + "grad_norm": 33.40193805568249, + "learning_rate": 4.7277928115743824e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.65625, + "logps/chosen": -1016.0, + "logps/rejected": -728.0, + "loss": 0.3324, + "rewards/accuracies": 
0.9375, + "rewards/chosen": 2.5234375, + "rewards/margins": 7.90625, + "rewards/rejected": -5.390625, + "step": 3022 + }, + { + "epoch": 0.5998313408403194, + "grad_norm": 32.217751255909, + "learning_rate": 4.724722568838192e-07, + "logits/chosen": 4.05078125, + "logits/rejected": 4.25390625, + "logps/chosen": -900.0, + "logps/rejected": -784.5, + "loss": 0.3581, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.67578125, + "rewards/margins": 7.49609375, + "rewards/rejected": -4.822265625, + "step": 3023 + }, + { + "epoch": 0.6000297633811201, + "grad_norm": 37.74419143149029, + "learning_rate": 4.7216526979888163e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.2890625, + "logps/chosen": -985.0, + "logps/rejected": -724.0, + "loss": 0.391, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1640625, + "rewards/margins": 7.654296875, + "rewards/rejected": -5.498046875, + "step": 3024 + }, + { + "epoch": 0.6002281859219207, + "grad_norm": 28.965021319552022, + "learning_rate": 4.7185832004988133e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.6171875, + "logps/chosen": -991.0, + "logps/rejected": -649.0, + "loss": 0.3619, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.607421875, + "rewards/margins": 7.6171875, + "rewards/rejected": -5.0, + "step": 3025 + }, + { + "epoch": 0.6004266084627213, + "grad_norm": 29.47198901422403, + "learning_rate": 4.71551407784057e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 3.78125, + "logps/chosen": -615.0, + "logps/rejected": -539.5, + "loss": 0.4198, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.818359375, + "rewards/margins": 6.2734375, + "rewards/rejected": -4.4609375, + "step": 3026 + }, + { + "epoch": 0.600625031003522, + "grad_norm": 30.543900160061458, + "learning_rate": 4.7124453314862867e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 3.85546875, + "logps/chosen": -1101.0, + "logps/rejected": -752.0, + "loss": 0.3683, + "rewards/accuracies": 0.90625, + 
"rewards/chosen": 3.0703125, + "rewards/margins": 8.09375, + "rewards/rejected": -5.0078125, + "step": 3027 + }, + { + "epoch": 0.6008234535443226, + "grad_norm": 36.42550107273664, + "learning_rate": 4.70937696290799e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.1640625, + "logps/chosen": -989.0, + "logps/rejected": -708.0, + "loss": 0.4856, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.802734375, + "rewards/margins": 7.7890625, + "rewards/rejected": -5.984375, + "step": 3028 + }, + { + "epoch": 0.6010218760851233, + "grad_norm": 40.31040696593962, + "learning_rate": 4.70630897357752e-07, + "logits/chosen": 4.125, + "logits/rejected": 3.84375, + "logps/chosen": -979.0, + "logps/rejected": -940.0, + "loss": 0.3387, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.337890625, + "rewards/margins": 9.3046875, + "rewards/rejected": -6.96875, + "step": 3029 + }, + { + "epoch": 0.6012202986259239, + "grad_norm": 23.994251797262237, + "learning_rate": 4.703241364966536e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 3.7421875, + "logps/chosen": -1022.0, + "logps/rejected": -777.0, + "loss": 0.2781, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1171875, + "rewards/margins": 9.375, + "rewards/rejected": -6.265625, + "step": 3030 + }, + { + "epoch": 0.6014187211667246, + "grad_norm": 38.6566853856437, + "learning_rate": 4.7001741385465165e-07, + "logits/chosen": 3.80859375, + "logits/rejected": 3.50390625, + "logps/chosen": -1301.0, + "logps/rejected": -846.5, + "loss": 0.491, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8046875, + "rewards/margins": 6.87109375, + "rewards/rejected": -4.060546875, + "step": 3031 + }, + { + "epoch": 0.6016171437075252, + "grad_norm": 33.8835320704496, + "learning_rate": 4.697107295788755e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 3.921875, + "logps/chosen": -825.0, + "logps/rejected": -730.0, + "loss": 0.4072, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.87890625, + 
"rewards/margins": 7.078125, + "rewards/rejected": -5.2109375, + "step": 3032 + }, + { + "epoch": 0.6018155662483258, + "grad_norm": 30.515970265670422, + "learning_rate": 4.6940408381643626e-07, + "logits/chosen": 4.32421875, + "logits/rejected": 4.2265625, + "logps/chosen": -1264.0, + "logps/rejected": -712.0, + "loss": 0.3601, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.068359375, + "rewards/margins": 8.1640625, + "rewards/rejected": -5.1015625, + "step": 3033 + }, + { + "epoch": 0.6020139887891265, + "grad_norm": 38.21542593808292, + "learning_rate": 4.6909747671442633e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.875, + "logps/chosen": -997.0, + "logps/rejected": -834.0, + "loss": 0.5161, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.97265625, + "rewards/margins": 6.2890625, + "rewards/rejected": -3.3203125, + "step": 3034 + }, + { + "epoch": 0.6022124113299271, + "grad_norm": 32.70862165580282, + "learning_rate": 4.687909084199198e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.38671875, + "logps/chosen": -972.0, + "logps/rejected": -653.5, + "loss": 0.4569, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.806640625, + "rewards/margins": 6.61328125, + "rewards/rejected": -3.806640625, + "step": 3035 + }, + { + "epoch": 0.6024108338707277, + "grad_norm": 32.20732292179224, + "learning_rate": 4.6848437907997185e-07, + "logits/chosen": 3.828125, + "logits/rejected": 3.31640625, + "logps/chosen": -1149.0, + "logps/rejected": -750.0, + "loss": 0.289, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.90234375, + "rewards/margins": 8.015625, + "rewards/rejected": -5.109375, + "step": 3036 + }, + { + "epoch": 0.6026092564115284, + "grad_norm": 26.570808669226874, + "learning_rate": 4.6817788884161915e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 3.71484375, + "logps/chosen": -1366.0, + "logps/rejected": -698.0, + "loss": 0.2623, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.20703125, + 
"rewards/margins": 9.296875, + "rewards/rejected": -6.078125, + "step": 3037 + }, + { + "epoch": 0.602807678952329, + "grad_norm": 32.737395032851445, + "learning_rate": 4.6787143785187987e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.1484375, + "logps/chosen": -793.5, + "logps/rejected": -500.5, + "loss": 0.4487, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4609375, + "rewards/margins": 5.75, + "rewards/rejected": -3.279296875, + "step": 3038 + }, + { + "epoch": 0.6030061014931296, + "grad_norm": 21.708849721799897, + "learning_rate": 4.67565026257753e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.12890625, + "logps/chosen": -1120.0, + "logps/rejected": -922.0, + "loss": 0.3632, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.904296875, + "rewards/margins": 17.0703125, + "rewards/rejected": -14.125, + "step": 3039 + }, + { + "epoch": 0.6032045240339302, + "grad_norm": 30.308736273447906, + "learning_rate": 4.672586542062188e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 3.55859375, + "logps/chosen": -870.0, + "logps/rejected": -897.0, + "loss": 0.3576, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.265625, + "rewards/margins": 17.3828125, + "rewards/rejected": -15.1640625, + "step": 3040 + }, + { + "epoch": 0.6034029465747309, + "grad_norm": 29.841862074123103, + "learning_rate": 4.6695232184423825e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.5546875, + "logps/chosen": -710.0, + "logps/rejected": -607.0, + "loss": 0.3357, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7890625, + "rewards/margins": 6.953125, + "rewards/rejected": -4.16015625, + "step": 3041 + }, + { + "epoch": 0.6036013691155315, + "grad_norm": 31.818165117372516, + "learning_rate": 4.6664602931875386e-07, + "logits/chosen": 4.36328125, + "logits/rejected": 4.296875, + "logps/chosen": -1090.0, + "logps/rejected": -816.5, + "loss": 0.3164, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.568359375, + "rewards/margins": 
8.46875, + "rewards/rejected": -5.8984375, + "step": 3042 + }, + { + "epoch": 0.6037997916563321, + "grad_norm": 35.767063849068066, + "learning_rate": 4.6633977677668846e-07, + "logits/chosen": 3.47265625, + "logits/rejected": 3.40625, + "logps/chosen": -1116.0, + "logps/rejected": -856.0, + "loss": 0.5263, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.357421875, + "rewards/margins": 6.46875, + "rewards/rejected": -4.109375, + "step": 3043 + }, + { + "epoch": 0.6039982141971328, + "grad_norm": 30.89386094787448, + "learning_rate": 4.660335643649462e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 3.9375, + "logps/chosen": -1039.5, + "logps/rejected": -768.0, + "loss": 0.3133, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.380859375, + "rewards/margins": 8.5390625, + "rewards/rejected": -6.15625, + "step": 3044 + }, + { + "epoch": 0.6041966367379334, + "grad_norm": 34.30957288959425, + "learning_rate": 4.657273922304115e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 4.30078125, + "logps/chosen": -842.0, + "logps/rejected": -460.0, + "loss": 0.5011, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.482421875, + "rewards/margins": 6.2109375, + "rewards/rejected": -3.7265625, + "step": 3045 + }, + { + "epoch": 0.604395059278734, + "grad_norm": 35.96509100642037, + "learning_rate": 4.6542126051994965e-07, + "logits/chosen": 3.74609375, + "logits/rejected": 3.98828125, + "logps/chosen": -797.0, + "logps/rejected": -524.5, + "loss": 0.4605, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.794921875, + "rewards/margins": 5.8125, + "rewards/rejected": -3.0107421875, + "step": 3046 + }, + { + "epoch": 0.6045934818195347, + "grad_norm": 40.63374767740291, + "learning_rate": 4.6511516938040686e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.14453125, + "logps/chosen": -1242.0, + "logps/rejected": -1028.0, + "loss": 0.3934, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.662109375, + "rewards/margins": 8.2265625, + 
"rewards/rejected": -5.56640625, + "step": 3047 + }, + { + "epoch": 0.6047919043603354, + "grad_norm": 27.573622567677052, + "learning_rate": 4.648091189586094e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.1640625, + "logps/chosen": -1086.0, + "logps/rejected": -750.0, + "loss": 0.2599, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.33203125, + "rewards/margins": 9.6875, + "rewards/rejected": -6.34375, + "step": 3048 + }, + { + "epoch": 0.604990326901136, + "grad_norm": 32.60028956505695, + "learning_rate": 4.645031094013646e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.28125, + "logps/chosen": -1148.0, + "logps/rejected": -1094.0, + "loss": 0.3971, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.677734375, + "rewards/margins": 9.22265625, + "rewards/rejected": -6.5546875, + "step": 3049 + }, + { + "epoch": 0.6051887494419366, + "grad_norm": 47.590176556565176, + "learning_rate": 4.641971408554592e-07, + "logits/chosen": 4.390625, + "logits/rejected": 4.5546875, + "logps/chosen": -1106.0, + "logps/rejected": -1310.0, + "loss": 0.3878, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.8955078125, + "rewards/margins": 9.75, + "rewards/rejected": -7.859375, + "step": 3050 + }, + { + "epoch": 0.6053871719827373, + "grad_norm": 33.63474804704652, + "learning_rate": 4.6389121346766136e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.16796875, + "logps/chosen": -883.5, + "logps/rejected": -719.5, + "loss": 0.4382, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.857421875, + "rewards/margins": 6.75, + "rewards/rejected": -4.90234375, + "step": 3051 + }, + { + "epoch": 0.6055855945235379, + "grad_norm": 38.833460178182264, + "learning_rate": 4.635853273847188e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 3.96875, + "logps/chosen": -1009.0, + "logps/rejected": -1219.0, + "loss": 0.4991, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.888671875, + "rewards/margins": 7.74609375, + "rewards/rejected": 
-5.861328125, + "step": 3052 + }, + { + "epoch": 0.6057840170643385, + "grad_norm": 31.598987761884796, + "learning_rate": 4.632794827533596e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.26171875, + "logps/chosen": -1085.0, + "logps/rejected": -759.0, + "loss": 0.3726, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.15234375, + "rewards/margins": 7.8515625, + "rewards/rejected": -4.703125, + "step": 3053 + }, + { + "epoch": 0.6059824396051391, + "grad_norm": 30.903636934724787, + "learning_rate": 4.6297367972029223e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.96484375, + "logps/chosen": -956.0, + "logps/rejected": -736.0, + "loss": 0.3862, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.1787109375, + "rewards/margins": 7.12890625, + "rewards/rejected": -4.9375, + "step": 3054 + }, + { + "epoch": 0.6061808621459398, + "grad_norm": 39.78162582169831, + "learning_rate": 4.6266791843220467e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 4.02734375, + "logps/chosen": -854.0, + "logps/rejected": -636.0, + "loss": 0.467, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.527587890625, + "rewards/margins": 7.453125, + "rewards/rejected": -5.9296875, + "step": 3055 + }, + { + "epoch": 0.6063792846867404, + "grad_norm": 37.442591683346905, + "learning_rate": 4.6236219903576554e-07, + "logits/chosen": 4.015625, + "logits/rejected": 3.90625, + "logps/chosen": -1099.5, + "logps/rejected": -907.5, + "loss": 0.3425, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.015625, + "rewards/margins": 9.7734375, + "rewards/rejected": -6.7578125, + "step": 3056 + }, + { + "epoch": 0.606577707227541, + "grad_norm": 32.07796733255717, + "learning_rate": 4.620565216776225e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.3046875, + "logps/chosen": -868.5, + "logps/rejected": -997.0, + "loss": 0.5124, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.19140625, + "rewards/margins": 7.6640625, + "rewards/rejected": -5.484375, + 
"step": 3057 + }, + { + "epoch": 0.6067761297683417, + "grad_norm": 36.508821587119336, + "learning_rate": 4.617508865044042e-07, + "logits/chosen": 4.40625, + "logits/rejected": 3.9609375, + "logps/chosen": -1111.0, + "logps/rejected": -849.0, + "loss": 0.3772, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.83984375, + "rewards/margins": 8.4453125, + "rewards/rejected": -5.625, + "step": 3058 + }, + { + "epoch": 0.6069745523091423, + "grad_norm": 34.16902612717884, + "learning_rate": 4.614452936627178e-07, + "logits/chosen": 3.984375, + "logits/rejected": 4.2109375, + "logps/chosen": -887.5, + "logps/rejected": -654.5, + "loss": 0.6091, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.5361328125, + "rewards/margins": 6.21875, + "rewards/rejected": -4.6875, + "step": 3059 + }, + { + "epoch": 0.6071729748499429, + "grad_norm": 35.275151578975986, + "learning_rate": 4.6113974329915126e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 4.2265625, + "logps/chosen": -898.5, + "logps/rejected": -849.5, + "loss": 0.4419, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.173828125, + "rewards/margins": 7.1953125, + "rewards/rejected": -5.015625, + "step": 3060 + }, + { + "epoch": 0.6073713973907436, + "grad_norm": 36.86902956090426, + "learning_rate": 4.6083423556027115e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 3.875, + "logps/chosen": -970.0, + "logps/rejected": -592.5, + "loss": 0.4687, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.275390625, + "rewards/margins": 5.96484375, + "rewards/rejected": -3.6875, + "step": 3061 + }, + { + "epoch": 0.6075698199315442, + "grad_norm": 21.54246009222317, + "learning_rate": 4.605287705926244e-07, + "logits/chosen": 4.23046875, + "logits/rejected": 3.89453125, + "logps/chosen": -1128.0, + "logps/rejected": -719.5, + "loss": 0.3655, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.421875, + "rewards/margins": 7.6796875, + "rewards/rejected": -5.2578125, + "step": 3062 + }, + { + 
"epoch": 0.6077682424723448, + "grad_norm": 34.23146131663909, + "learning_rate": 4.6022334854273715e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.08984375, + "logps/chosen": -1107.0, + "logps/rejected": -1449.5, + "loss": 0.3823, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.580078125, + "rewards/margins": 10.4140625, + "rewards/rejected": -7.8515625, + "step": 3063 + }, + { + "epoch": 0.6079666650131454, + "grad_norm": 30.386159295242052, + "learning_rate": 4.599179695571148e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.09765625, + "logps/chosen": -1023.5, + "logps/rejected": -1296.0, + "loss": 0.3446, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.3203125, + "rewards/margins": 10.125, + "rewards/rejected": -7.78515625, + "step": 3064 + }, + { + "epoch": 0.6081650875539462, + "grad_norm": 30.125058525937042, + "learning_rate": 4.596126337822426e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 3.80859375, + "logps/chosen": -1192.0, + "logps/rejected": -828.0, + "loss": 0.3263, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.98046875, + "rewards/margins": 8.1015625, + "rewards/rejected": -5.11328125, + "step": 3065 + }, + { + "epoch": 0.6083635100947468, + "grad_norm": 25.457667501020772, + "learning_rate": 4.5930734136458404e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.37890625, + "logps/chosen": -740.0, + "logps/rejected": -648.5, + "loss": 0.4473, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.333984375, + "rewards/margins": 8.1328125, + "rewards/rejected": -5.7890625, + "step": 3066 + }, + { + "epoch": 0.6085619326355474, + "grad_norm": 27.007159458967568, + "learning_rate": 4.59002092450583e-07, + "logits/chosen": 4.71875, + "logits/rejected": 4.6328125, + "logps/chosen": -1272.0, + "logps/rejected": -951.0, + "loss": 0.3111, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.58984375, + "rewards/margins": 9.484375, + "rewards/rejected": -5.8828125, + "step": 3067 + }, + { + "epoch": 
0.6087603551763481, + "grad_norm": 38.53647393991096, + "learning_rate": 4.5869688718666164e-07, + "logits/chosen": 3.76953125, + "logits/rejected": 3.89453125, + "logps/chosen": -967.0, + "logps/rejected": -888.0, + "loss": 0.4263, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3828125, + "rewards/margins": 7.15625, + "rewards/rejected": -4.77734375, + "step": 3068 + }, + { + "epoch": 0.6089587777171487, + "grad_norm": 42.26591185463151, + "learning_rate": 4.583917257192217e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.94921875, + "logps/chosen": -1043.0, + "logps/rejected": -908.5, + "loss": 0.3649, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.966796875, + "rewards/margins": 8.3515625, + "rewards/rejected": -6.3984375, + "step": 3069 + }, + { + "epoch": 0.6091572002579493, + "grad_norm": 23.35857554848745, + "learning_rate": 4.5808660819464396e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 3.8984375, + "logps/chosen": -1085.0, + "logps/rejected": -712.0, + "loss": 0.4362, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.017578125, + "rewards/margins": 7.0859375, + "rewards/rejected": -4.078125, + "step": 3070 + }, + { + "epoch": 0.6093556227987499, + "grad_norm": 34.683968653512025, + "learning_rate": 4.577815347592873e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 4.55078125, + "logps/chosen": -1075.0, + "logps/rejected": -712.0, + "loss": 0.4864, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.9681396484375, + "rewards/margins": 6.69140625, + "rewards/rejected": -4.72265625, + "step": 3071 + }, + { + "epoch": 0.6095540453395506, + "grad_norm": 41.38885988130224, + "learning_rate": 4.5747650555949037e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 4.20703125, + "logps/chosen": -907.0, + "logps/rejected": -836.0, + "loss": 0.4351, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.888671875, + "rewards/margins": 6.8203125, + "rewards/rejected": -4.94140625, + "step": 3072 + }, + { + "epoch": 
0.6097524678803512, + "grad_norm": 27.034151144126213, + "learning_rate": 4.5717152074157016e-07, + "logits/chosen": 3.78515625, + "logits/rejected": 3.63671875, + "logps/chosen": -904.0, + "logps/rejected": -778.0, + "loss": 0.3792, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.931640625, + "rewards/margins": 7.91015625, + "rewards/rejected": -4.9921875, + "step": 3073 + }, + { + "epoch": 0.6099508904211518, + "grad_norm": 31.94891248947648, + "learning_rate": 4.568665804518227e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.86328125, + "logps/chosen": -1178.0, + "logps/rejected": -803.0, + "loss": 0.3733, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.56591796875, + "rewards/margins": 10.2265625, + "rewards/rejected": -8.671875, + "step": 3074 + }, + { + "epoch": 0.6101493129619525, + "grad_norm": 31.090018897933028, + "learning_rate": 4.5656168483652204e-07, + "logits/chosen": 4.078125, + "logits/rejected": 3.90234375, + "logps/chosen": -1081.0, + "logps/rejected": -748.0, + "loss": 0.4692, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4091796875, + "rewards/margins": 6.60546875, + "rewards/rejected": -4.19140625, + "step": 3075 + }, + { + "epoch": 0.6103477355027531, + "grad_norm": 34.96261906547331, + "learning_rate": 4.562568340419214e-07, + "logits/chosen": 3.48828125, + "logits/rejected": 3.65234375, + "logps/chosen": -812.5, + "logps/rejected": -535.5, + "loss": 0.4492, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4609375, + "rewards/margins": 6.2421875, + "rewards/rejected": -3.7734375, + "step": 3076 + }, + { + "epoch": 0.6105461580435537, + "grad_norm": 30.866312494865657, + "learning_rate": 4.5595202821425227e-07, + "logits/chosen": 3.66796875, + "logits/rejected": 3.66015625, + "logps/chosen": -1155.0, + "logps/rejected": -1018.0, + "loss": 0.3081, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1796875, + "rewards/margins": 10.3828125, + "rewards/rejected": -7.1953125, + "step": 3077 + }, + { + 
"epoch": 0.6107445805843544, + "grad_norm": 33.61212641611908, + "learning_rate": 4.556472674997246e-07, + "logits/chosen": 3.73828125, + "logits/rejected": 4.3046875, + "logps/chosen": -988.0, + "logps/rejected": -837.0, + "loss": 0.5835, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.51513671875, + "rewards/margins": 6.296875, + "rewards/rejected": -4.78125, + "step": 3078 + }, + { + "epoch": 0.610943003125155, + "grad_norm": 27.6427616776932, + "learning_rate": 4.55342552044527e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.43359375, + "logps/chosen": -1004.0, + "logps/rejected": -663.0, + "loss": 0.3773, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.650390625, + "rewards/margins": 6.953125, + "rewards/rejected": -4.3046875, + "step": 3079 + }, + { + "epoch": 0.6111414256659556, + "grad_norm": 34.42573576355121, + "learning_rate": 4.550378819948254e-07, + "logits/chosen": 3.5, + "logits/rejected": 3.34765625, + "logps/chosen": -1158.0, + "logps/rejected": -645.0, + "loss": 0.4146, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.69140625, + "rewards/margins": 8.1484375, + "rewards/rejected": -5.4453125, + "step": 3080 + }, + { + "epoch": 0.6113398482067562, + "grad_norm": 35.436277473713375, + "learning_rate": 4.547332574967653e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.53125, + "logps/chosen": -909.0, + "logps/rejected": -696.0, + "loss": 0.4109, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3828125, + "rewards/margins": 7.1328125, + "rewards/rejected": -4.75, + "step": 3081 + }, + { + "epoch": 0.611538270747557, + "grad_norm": 30.013777755249222, + "learning_rate": 4.544286786964691e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 3.890625, + "logps/chosen": -1103.0, + "logps/rejected": -723.0, + "loss": 0.3209, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9609375, + "rewards/margins": 8.5234375, + "rewards/rejected": -5.5546875, + "step": 3082 + }, + { + "epoch": 0.6117366932883576, + "grad_norm": 
36.78519035481843, + "learning_rate": 4.541241457400381e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.1171875, + "logps/chosen": -807.0, + "logps/rejected": -722.0, + "loss": 0.5034, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.017578125, + "rewards/margins": 5.47265625, + "rewards/rejected": -3.443359375, + "step": 3083 + }, + { + "epoch": 0.6119351158291582, + "grad_norm": 33.38828595681617, + "learning_rate": 4.538196587735512e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.4765625, + "logps/chosen": -916.0, + "logps/rejected": -770.5, + "loss": 0.4126, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.55859375, + "rewards/margins": 6.4921875, + "rewards/rejected": -3.93359375, + "step": 3084 + }, + { + "epoch": 0.6121335383699589, + "grad_norm": 31.91799340739661, + "learning_rate": 4.535152179430656e-07, + "logits/chosen": 3.8046875, + "logits/rejected": 3.7265625, + "logps/chosen": -896.0, + "logps/rejected": -632.0, + "loss": 0.3684, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.05859375, + "rewards/margins": 7.53125, + "rewards/rejected": -5.4765625, + "step": 3085 + }, + { + "epoch": 0.6123319609107595, + "grad_norm": 37.89910405467835, + "learning_rate": 4.532108233946162e-07, + "logits/chosen": 3.6796875, + "logits/rejected": 4.1015625, + "logps/chosen": -1010.0, + "logps/rejected": -2077.0, + "loss": 0.4203, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.5673828125, + "rewards/margins": 10.34375, + "rewards/rejected": -8.78125, + "step": 3086 + }, + { + "epoch": 0.6125303834515601, + "grad_norm": 36.08785310146455, + "learning_rate": 4.5290647527421545e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.98046875, + "logps/chosen": -592.5, + "logps/rejected": -576.0, + "loss": 0.4791, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.06494140625, + "rewards/margins": 6.08203125, + "rewards/rejected": -4.01171875, + "step": 3087 + }, + { + "epoch": 0.6127288059923607, + "grad_norm": 
33.3163741581358, + "learning_rate": 4.526021737278537e-07, + "logits/chosen": 4.390625, + "logits/rejected": 4.2890625, + "logps/chosen": -759.5, + "logps/rejected": -725.0, + "loss": 0.5709, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6083984375, + "rewards/margins": 5.23046875, + "rewards/rejected": -3.6328125, + "step": 3088 + }, + { + "epoch": 0.6129272285331614, + "grad_norm": 23.852561463526506, + "learning_rate": 4.5229791890149914e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.2265625, + "logps/chosen": -1016.0, + "logps/rejected": -657.5, + "loss": 0.3175, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.94140625, + "rewards/margins": 8.359375, + "rewards/rejected": -5.4140625, + "step": 3089 + }, + { + "epoch": 0.613125651073962, + "grad_norm": 30.226031109314142, + "learning_rate": 4.519937109410976e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.9453125, + "logps/chosen": -819.0, + "logps/rejected": -833.0, + "loss": 0.4883, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.0302734375, + "rewards/margins": 7.21875, + "rewards/rejected": -5.19921875, + "step": 3090 + }, + { + "epoch": 0.6133240736147626, + "grad_norm": 30.970533755561764, + "learning_rate": 4.516895499925718e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 4.19140625, + "logps/chosen": -633.5, + "logps/rejected": -732.0, + "loss": 0.4708, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.92578125, + "rewards/margins": 7.5, + "rewards/rejected": -5.5859375, + "step": 3091 + }, + { + "epoch": 0.6135224961555633, + "grad_norm": 41.541628346606764, + "learning_rate": 4.513854362018226e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.2109375, + "logps/chosen": -1186.0, + "logps/rejected": -867.5, + "loss": 0.3819, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.53125, + "rewards/margins": 9.625, + "rewards/rejected": -7.08984375, + "step": 3092 + }, + { + "epoch": 0.6137209186963639, + "grad_norm": 31.258793889276618, + 
"learning_rate": 4.5108136971472775e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.046875, + "logps/chosen": -956.0, + "logps/rejected": -627.0, + "loss": 0.4491, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.076171875, + "rewards/margins": 6.1328125, + "rewards/rejected": -4.0625, + "step": 3093 + }, + { + "epoch": 0.6139193412371645, + "grad_norm": 38.44963882780189, + "learning_rate": 4.5077735067714295e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 4.08203125, + "logps/chosen": -995.5, + "logps/rejected": -668.0, + "loss": 0.56, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.54248046875, + "rewards/margins": 7.0625, + "rewards/rejected": -4.5185546875, + "step": 3094 + }, + { + "epoch": 0.6141177637779652, + "grad_norm": 25.68943618472502, + "learning_rate": 4.5047337923490056e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 4.03515625, + "logps/chosen": -851.0, + "logps/rejected": -699.0, + "loss": 0.4822, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.63671875, + "rewards/margins": 7.015625, + "rewards/rejected": -4.390625, + "step": 3095 + }, + { + "epoch": 0.6143161863187658, + "grad_norm": 35.86985369571956, + "learning_rate": 4.5016945553381e-07, + "logits/chosen": 3.671875, + "logits/rejected": 3.87890625, + "logps/chosen": -826.0, + "logps/rejected": -756.0, + "loss": 0.5526, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.681640625, + "rewards/margins": 7.1640625, + "rewards/rejected": -4.48046875, + "step": 3096 + }, + { + "epoch": 0.6145146088595664, + "grad_norm": 29.290763268148925, + "learning_rate": 4.4986557971965856e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 3.91796875, + "logps/chosen": -893.0, + "logps/rejected": -723.0, + "loss": 0.364, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.0810546875, + "rewards/margins": 7.953125, + "rewards/rejected": -5.890625, + "step": 3097 + }, + { + "epoch": 0.614713031400367, + "grad_norm": 37.6021837098298, + "learning_rate": 
4.4956175193820965e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 3.953125, + "logps/chosen": -865.5, + "logps/rejected": -1012.0, + "loss": 0.3833, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1953125, + "rewards/margins": 7.8984375, + "rewards/rejected": -5.70703125, + "step": 3098 + }, + { + "epoch": 0.6149114539411678, + "grad_norm": 30.03344531105555, + "learning_rate": 4.4925797233520433e-07, + "logits/chosen": 3.57421875, + "logits/rejected": 3.546875, + "logps/chosen": -963.0, + "logps/rejected": -1446.0, + "loss": 0.326, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.43218994140625, + "rewards/margins": 14.515625, + "rewards/rejected": -12.0703125, + "step": 3099 + }, + { + "epoch": 0.6151098764819684, + "grad_norm": 42.57117673512975, + "learning_rate": 4.4895424105636e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.9921875, + "logps/chosen": -1031.0, + "logps/rejected": -679.0, + "loss": 0.483, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.697265625, + "rewards/margins": 7.953125, + "rewards/rejected": -6.2578125, + "step": 3100 + }, + { + "epoch": 0.615308299022769, + "grad_norm": 50.87015970539097, + "learning_rate": 4.486505582473714e-07, + "logits/chosen": 3.63671875, + "logits/rejected": 3.72265625, + "logps/chosen": -914.0, + "logps/rejected": -894.0, + "loss": 0.4431, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.166015625, + "rewards/margins": 8.01171875, + "rewards/rejected": -5.84765625, + "step": 3101 + }, + { + "epoch": 0.6155067215635697, + "grad_norm": 25.94308944455255, + "learning_rate": 4.4834692405390997e-07, + "logits/chosen": 4.31640625, + "logits/rejected": 3.73828125, + "logps/chosen": -753.0, + "logps/rejected": -666.0, + "loss": 0.431, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.576171875, + "rewards/margins": 6.71875, + "rewards/rejected": -4.12890625, + "step": 3102 + }, + { + "epoch": 0.6157051441043703, + "grad_norm": 27.84976622535786, + "learning_rate": 
4.4804333862162305e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.96484375, + "logps/chosen": -918.0, + "logps/rejected": -531.0, + "loss": 0.3994, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.78125, + "rewards/margins": 7.84375, + "rewards/rejected": -5.0859375, + "step": 3103 + }, + { + "epoch": 0.6159035666451709, + "grad_norm": 34.57845611835241, + "learning_rate": 4.4773980209613557e-07, + "logits/chosen": 4.5078125, + "logits/rejected": 4.453125, + "logps/chosen": -1153.0, + "logps/rejected": -1570.0, + "loss": 0.3778, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.814453125, + "rewards/margins": 12.1640625, + "rewards/rejected": -10.36328125, + "step": 3104 + }, + { + "epoch": 0.6161019891859715, + "grad_norm": 38.57684392264814, + "learning_rate": 4.4743631462304865e-07, + "logits/chosen": 3.77734375, + "logits/rejected": 3.859375, + "logps/chosen": -919.0, + "logps/rejected": -1580.0, + "loss": 0.466, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.025390625, + "rewards/margins": 8.2109375, + "rewards/rejected": -6.1875, + "step": 3105 + }, + { + "epoch": 0.6163004117267722, + "grad_norm": 26.243708693971296, + "learning_rate": 4.471328763479397e-07, + "logits/chosen": 4.6875, + "logits/rejected": 4.609375, + "logps/chosen": -1069.0, + "logps/rejected": -1046.0, + "loss": 0.2468, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5546875, + "rewards/margins": 10.1640625, + "rewards/rejected": -7.60546875, + "step": 3106 + }, + { + "epoch": 0.6164988342675728, + "grad_norm": 26.81639714569637, + "learning_rate": 4.4682948741636273e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 3.9609375, + "logps/chosen": -1064.0, + "logps/rejected": -893.0, + "loss": 0.305, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2109375, + "rewards/margins": 10.4609375, + "rewards/rejected": -7.2578125, + "step": 3107 + }, + { + "epoch": 0.6166972568083734, + "grad_norm": 38.579597553796354, + "learning_rate": 
4.4652614797384793e-07, + "logits/chosen": 3.6640625, + "logits/rejected": 3.8515625, + "logps/chosen": -981.0, + "logps/rejected": -809.0, + "loss": 0.491, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.3828125, + "rewards/margins": 6.3984375, + "rewards/rejected": -4.015625, + "step": 3108 + }, + { + "epoch": 0.6168956793491741, + "grad_norm": 33.84903115693497, + "learning_rate": 4.4622285816590186e-07, + "logits/chosen": 3.6796875, + "logits/rejected": 3.6171875, + "logps/chosen": -816.0, + "logps/rejected": -787.0, + "loss": 0.5011, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.09375, + "rewards/margins": 6.94921875, + "rewards/rejected": -4.84765625, + "step": 3109 + }, + { + "epoch": 0.6170941018899747, + "grad_norm": 24.921547551094108, + "learning_rate": 4.459196181380074e-07, + "logits/chosen": 4.29296875, + "logits/rejected": 4.421875, + "logps/chosen": -1141.0, + "logps/rejected": -913.0, + "loss": 0.2921, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.375, + "rewards/margins": 10.453125, + "rewards/rejected": -7.0625, + "step": 3110 + }, + { + "epoch": 0.6172925244307753, + "grad_norm": 42.74744658996315, + "learning_rate": 4.456164280356235e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.171875, + "logps/chosen": -783.0, + "logps/rejected": -669.5, + "loss": 0.4902, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.90234375, + "rewards/margins": 6.43359375, + "rewards/rejected": -4.5390625, + "step": 3111 + }, + { + "epoch": 0.6174909469715759, + "grad_norm": 23.67700560969418, + "learning_rate": 4.453132880041845e-07, + "logits/chosen": 4.265625, + "logits/rejected": 3.9609375, + "logps/chosen": -933.0, + "logps/rejected": -688.0, + "loss": 0.3922, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.830078125, + "rewards/margins": 7.140625, + "rewards/rejected": -4.3125, + "step": 3112 + }, + { + "epoch": 0.6176893695123766, + "grad_norm": 21.45067723723794, + "learning_rate": 4.450101981891019e-07, + 
"logits/chosen": 3.953125, + "logits/rejected": 3.921875, + "logps/chosen": -1062.0, + "logps/rejected": -501.0, + "loss": 0.3169, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1484375, + "rewards/margins": 8.015625, + "rewards/rejected": -4.86328125, + "step": 3113 + }, + { + "epoch": 0.6178877920531772, + "grad_norm": 28.93898558762375, + "learning_rate": 4.447071587357621e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 3.94921875, + "logps/chosen": -1173.0, + "logps/rejected": -706.5, + "loss": 0.3402, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.3212890625, + "rewards/margins": 8.078125, + "rewards/rejected": -5.76953125, + "step": 3114 + }, + { + "epoch": 0.6180862145939778, + "grad_norm": 28.373823498350987, + "learning_rate": 4.444041697895282e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.6640625, + "logps/chosen": -866.0, + "logps/rejected": -477.0, + "loss": 0.4787, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.12255859375, + "rewards/margins": 5.6953125, + "rewards/rejected": -3.5810546875, + "step": 3115 + }, + { + "epoch": 0.6182846371347785, + "grad_norm": 29.719694963487523, + "learning_rate": 4.441012314957381e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.06640625, + "logps/chosen": -1135.0, + "logps/rejected": -721.0, + "loss": 0.3134, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.279296875, + "rewards/margins": 8.3515625, + "rewards/rejected": -5.08203125, + "step": 3116 + }, + { + "epoch": 0.6184830596755791, + "grad_norm": 23.682016213241045, + "learning_rate": 4.4379834399970604e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.296875, + "logps/chosen": -1175.0, + "logps/rejected": -1143.0, + "loss": 0.3665, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.361328125, + "rewards/margins": 8.953125, + "rewards/rejected": -6.6015625, + "step": 3117 + }, + { + "epoch": 0.6186814822163798, + "grad_norm": 34.18356364907679, + "learning_rate": 4.4349550744672213e-07, + 
"logits/chosen": 4.28515625, + "logits/rejected": 4.0625, + "logps/chosen": -842.0, + "logps/rejected": -606.5, + "loss": 0.4904, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.142578125, + "rewards/margins": 6.5703125, + "rewards/rejected": -4.42578125, + "step": 3118 + }, + { + "epoch": 0.6188799047571805, + "grad_norm": 31.51777767124971, + "learning_rate": 4.431927219820513e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.3828125, + "logps/chosen": -802.0, + "logps/rejected": -714.0, + "loss": 0.491, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.12890625, + "rewards/margins": 7.3203125, + "rewards/rejected": -5.1875, + "step": 3119 + }, + { + "epoch": 0.6190783272979811, + "grad_norm": 29.154215793183784, + "learning_rate": 4.4288998775093465e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 3.78515625, + "logps/chosen": -794.5, + "logps/rejected": -587.0, + "loss": 0.4189, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.01953125, + "rewards/margins": 6.6953125, + "rewards/rejected": -4.671875, + "step": 3120 + }, + { + "epoch": 0.6192767498387817, + "grad_norm": 34.61478103913599, + "learning_rate": 4.42587304898588e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 3.73828125, + "logps/chosen": -1032.0, + "logps/rejected": -606.0, + "loss": 0.3398, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.13671875, + "rewards/margins": 8.1328125, + "rewards/rejected": -5.0, + "step": 3121 + }, + { + "epoch": 0.6194751723795823, + "grad_norm": 42.84036852653632, + "learning_rate": 4.422846735702035e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.13671875, + "logps/chosen": -942.0, + "logps/rejected": -814.0, + "loss": 0.4058, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.14453125, + "rewards/margins": 7.90625, + "rewards/rejected": -5.7734375, + "step": 3122 + }, + { + "epoch": 0.619673594920383, + "grad_norm": 31.470275199722423, + "learning_rate": 4.419820939109474e-07, + "logits/chosen": 4.3828125, + 
"logits/rejected": 4.25, + "logps/chosen": -678.0, + "logps/rejected": -520.0, + "loss": 0.3881, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0712890625, + "rewards/margins": 6.78125, + "rewards/rejected": -4.6953125, + "step": 3123 + }, + { + "epoch": 0.6198720174611836, + "grad_norm": 31.052853565352535, + "learning_rate": 4.4167956606596225e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.1015625, + "logps/chosen": -1205.0, + "logps/rejected": -2859.0, + "loss": 0.3306, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.42578125, + "rewards/margins": 17.90625, + "rewards/rejected": -14.4609375, + "step": 3124 + }, + { + "epoch": 0.6200704400019842, + "grad_norm": 35.98315264378893, + "learning_rate": 4.4137709018036495e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 3.6953125, + "logps/chosen": -1038.0, + "logps/rejected": -789.0, + "loss": 0.5215, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.46337890625, + "rewards/margins": 6.13671875, + "rewards/rejected": -4.67578125, + "step": 3125 + }, + { + "epoch": 0.6202688625427849, + "grad_norm": 27.13158157268061, + "learning_rate": 4.410746663992481e-07, + "logits/chosen": 3.68359375, + "logits/rejected": 3.97265625, + "logps/chosen": -962.0, + "logps/rejected": -831.5, + "loss": 0.4145, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.482421875, + "rewards/margins": 7.4609375, + "rewards/rejected": -4.984375, + "step": 3126 + }, + { + "epoch": 0.6204672850835855, + "grad_norm": 42.771764329407425, + "learning_rate": 4.4077229486767906e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.1171875, + "logps/chosen": -1168.5, + "logps/rejected": -823.0, + "loss": 0.3136, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5546875, + "rewards/margins": 9.15625, + "rewards/rejected": -6.609375, + "step": 3127 + }, + { + "epoch": 0.6206657076243861, + "grad_norm": 26.793771560390873, + "learning_rate": 4.404699757306998e-07, + "logits/chosen": 3.7265625, + 
"logits/rejected": 3.48046875, + "logps/chosen": -973.0, + "logps/rejected": -1445.5, + "loss": 0.2957, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.759765625, + "rewards/margins": 10.0859375, + "rewards/rejected": -7.32421875, + "step": 3128 + }, + { + "epoch": 0.6208641301651867, + "grad_norm": 33.451241724156986, + "learning_rate": 4.401677091333277e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 4.27734375, + "logps/chosen": -1191.0, + "logps/rejected": -1264.0, + "loss": 0.2817, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.841796875, + "rewards/margins": 11.6328125, + "rewards/rejected": -8.7890625, + "step": 3129 + }, + { + "epoch": 0.6210625527059874, + "grad_norm": 30.610636966462348, + "learning_rate": 4.3986549522055425e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.02734375, + "logps/chosen": -774.0, + "logps/rejected": -660.0, + "loss": 0.4577, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8193359375, + "rewards/margins": 7.2421875, + "rewards/rejected": -5.4140625, + "step": 3130 + }, + { + "epoch": 0.621260975246788, + "grad_norm": 28.883305635717765, + "learning_rate": 4.395633341373467e-07, + "logits/chosen": 3.61328125, + "logits/rejected": 3.515625, + "logps/chosen": -852.0, + "logps/rejected": -903.0, + "loss": 0.6303, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.5810546875, + "rewards/margins": 6.953125, + "rewards/rejected": -5.3671875, + "step": 3131 + }, + { + "epoch": 0.6214593977875886, + "grad_norm": 25.91033188372077, + "learning_rate": 4.392612260286459e-07, + "logits/chosen": 3.67578125, + "logits/rejected": 3.6796875, + "logps/chosen": -960.0, + "logps/rejected": -681.5, + "loss": 0.4275, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.333984375, + "rewards/margins": 6.51171875, + "rewards/rejected": -4.18359375, + "step": 3132 + }, + { + "epoch": 0.6216578203283893, + "grad_norm": 36.29416880654996, + "learning_rate": 4.3895917103936783e-07, + "logits/chosen": 3.75, + 
"logits/rejected": 3.7734375, + "logps/chosen": -946.0, + "logps/rejected": -1191.0, + "loss": 0.4761, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.8232421875, + "rewards/margins": 8.58984375, + "rewards/rejected": -7.7578125, + "step": 3133 + }, + { + "epoch": 0.62185624286919, + "grad_norm": 34.2454648487012, + "learning_rate": 4.386571693144031e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 4.4296875, + "logps/chosen": -1002.0, + "logps/rejected": -1368.0, + "loss": 0.3429, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.931640625, + "rewards/margins": 10.796875, + "rewards/rejected": -7.84375, + "step": 3134 + }, + { + "epoch": 0.6220546654099905, + "grad_norm": 23.235889141078648, + "learning_rate": 4.383552209986163e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.3671875, + "logps/chosen": -1047.0, + "logps/rejected": -826.0, + "loss": 0.3237, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.666015625, + "rewards/margins": 8.5546875, + "rewards/rejected": -5.8828125, + "step": 3135 + }, + { + "epoch": 0.6222530879507913, + "grad_norm": 31.464245316895614, + "learning_rate": 4.380533262368471e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.05859375, + "logps/chosen": -1135.0, + "logps/rejected": -799.0, + "loss": 0.4832, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.505859375, + "rewards/margins": 5.82421875, + "rewards/rejected": -3.31201171875, + "step": 3136 + }, + { + "epoch": 0.6224515104915919, + "grad_norm": 34.750030215698764, + "learning_rate": 4.3775148517390846e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.96484375, + "logps/chosen": -1091.0, + "logps/rejected": -733.5, + "loss": 0.4518, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.26953125, + "rewards/margins": 7.484375, + "rewards/rejected": -5.2109375, + "step": 3137 + }, + { + "epoch": 0.6226499330323925, + "grad_norm": 35.10766594929182, + "learning_rate": 4.374496979545886e-07, + "logits/chosen": 3.93359375, + 
"logits/rejected": 4.0703125, + "logps/chosen": -735.0, + "logps/rejected": -774.0, + "loss": 0.4034, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.884765625, + "rewards/margins": 7.625, + "rewards/rejected": -5.7421875, + "step": 3138 + }, + { + "epoch": 0.6228483555731931, + "grad_norm": 36.83911379521741, + "learning_rate": 4.3714796472364925e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.09765625, + "logps/chosen": -744.5, + "logps/rejected": -753.5, + "loss": 0.4773, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.16015625, + "rewards/margins": 7.140625, + "rewards/rejected": -4.984375, + "step": 3139 + }, + { + "epoch": 0.6230467781139938, + "grad_norm": 25.563299521517663, + "learning_rate": 4.368462856258268e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 4.109375, + "logps/chosen": -1240.0, + "logps/rejected": -901.0, + "loss": 0.4064, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3125, + "rewards/margins": 9.109375, + "rewards/rejected": -5.8125, + "step": 3140 + }, + { + "epoch": 0.6232452006547944, + "grad_norm": 28.59103871745837, + "learning_rate": 4.365446608058311e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 3.9609375, + "logps/chosen": -1020.0, + "logps/rejected": -747.0, + "loss": 0.5223, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.798828125, + "rewards/margins": 5.6953125, + "rewards/rejected": -3.896484375, + "step": 3141 + }, + { + "epoch": 0.623443623195595, + "grad_norm": 28.339052910160838, + "learning_rate": 4.3624309040834604e-07, + "logits/chosen": 3.61328125, + "logits/rejected": 3.59375, + "logps/chosen": -973.0, + "logps/rejected": -1145.0, + "loss": 0.289, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2578125, + "rewards/margins": 10.453125, + "rewards/rejected": -7.1953125, + "step": 3142 + }, + { + "epoch": 0.6236420457363957, + "grad_norm": 35.645088805522406, + "learning_rate": 4.3594157457803007e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 
3.9140625, + "logps/chosen": -620.0, + "logps/rejected": -540.0, + "loss": 0.4231, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.58984375, + "rewards/margins": 7.390625, + "rewards/rejected": -5.79296875, + "step": 3143 + }, + { + "epoch": 0.6238404682771963, + "grad_norm": 26.627306396759582, + "learning_rate": 4.3564011345951457e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.10546875, + "logps/chosen": -932.0, + "logps/rejected": -641.0, + "loss": 0.4336, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7890625, + "rewards/margins": 6.8359375, + "rewards/rejected": -4.0546875, + "step": 3144 + }, + { + "epoch": 0.6240388908179969, + "grad_norm": 28.773956874214292, + "learning_rate": 4.3533870719740553e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.265625, + "logps/chosen": -916.0, + "logps/rejected": -807.0, + "loss": 0.3347, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.26171875, + "rewards/margins": 7.734375, + "rewards/rejected": -4.4921875, + "step": 3145 + }, + { + "epoch": 0.6242373133587975, + "grad_norm": 42.81267629428633, + "learning_rate": 4.350373559362815e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.2109375, + "logps/chosen": -946.5, + "logps/rejected": -826.0, + "loss": 0.4825, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.501953125, + "rewards/margins": 7.2578125, + "rewards/rejected": -4.75, + "step": 3146 + }, + { + "epoch": 0.6244357358995982, + "grad_norm": 27.607553940106317, + "learning_rate": 4.3473605982069603e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.10546875, + "logps/chosen": -681.0, + "logps/rejected": -582.5, + "loss": 0.4691, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.904296875, + "rewards/margins": 5.78125, + "rewards/rejected": -3.87890625, + "step": 3147 + }, + { + "epoch": 0.6246341584403988, + "grad_norm": 32.9350124278443, + "learning_rate": 4.344348189951751e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.08203125, + "logps/chosen": 
-1162.0, + "logps/rejected": -842.0, + "loss": 0.436, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5078125, + "rewards/margins": 8.4453125, + "rewards/rejected": -5.9296875, + "step": 3148 + }, + { + "epoch": 0.6248325809811994, + "grad_norm": 35.318990417147894, + "learning_rate": 4.341336336042185e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.01171875, + "logps/chosen": -990.0, + "logps/rejected": -759.0, + "loss": 0.3515, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.056640625, + "rewards/margins": 7.734375, + "rewards/rejected": -5.6796875, + "step": 3149 + }, + { + "epoch": 0.6250310035220001, + "grad_norm": 23.15644787067622, + "learning_rate": 4.3383250379229994e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.14453125, + "logps/chosen": -965.0, + "logps/rejected": -646.0, + "loss": 0.3652, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.828125, + "rewards/margins": 7.4921875, + "rewards/rejected": -4.6796875, + "step": 3150 + }, + { + "epoch": 0.6252294260628007, + "grad_norm": 30.373215908407662, + "learning_rate": 4.3353142970386557e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 4.08203125, + "logps/chosen": -1069.0, + "logps/rejected": -722.0, + "loss": 0.3727, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.9404296875, + "rewards/margins": 7.53125, + "rewards/rejected": -5.6015625, + "step": 3151 + }, + { + "epoch": 0.6254278486036013, + "grad_norm": 25.1364413082765, + "learning_rate": 4.332304114833358e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 3.8515625, + "logps/chosen": -963.5, + "logps/rejected": -881.0, + "loss": 0.4079, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.306640625, + "rewards/margins": 8.06640625, + "rewards/rejected": -5.755859375, + "step": 3152 + }, + { + "epoch": 0.6256262711444021, + "grad_norm": 31.626980315712707, + "learning_rate": 4.3292944927510324e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.4453125, + "logps/chosen": -738.5, 
+ "logps/rejected": -613.0, + "loss": 0.4539, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.564453125, + "rewards/margins": 8.96875, + "rewards/rejected": -6.39453125, + "step": 3153 + }, + { + "epoch": 0.6258246936852027, + "grad_norm": 32.469105096102766, + "learning_rate": 4.3262854322353426e-07, + "logits/chosen": 4.15625, + "logits/rejected": 3.8125, + "logps/chosen": -893.0, + "logps/rejected": -568.0, + "loss": 0.4365, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.25390625, + "rewards/margins": 6.30859375, + "rewards/rejected": -4.048828125, + "step": 3154 + }, + { + "epoch": 0.6260231162260033, + "grad_norm": 33.09244825276915, + "learning_rate": 4.3232769347296827e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 3.91796875, + "logps/chosen": -1012.0, + "logps/rejected": -863.0, + "loss": 0.3513, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6796875, + "rewards/margins": 8.6328125, + "rewards/rejected": -5.94140625, + "step": 3155 + }, + { + "epoch": 0.6262215387668039, + "grad_norm": 38.77985039592766, + "learning_rate": 4.3202690016771757e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.0625, + "logps/chosen": -1246.0, + "logps/rejected": -816.0, + "loss": 0.4077, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9990234375, + "rewards/margins": 7.7109375, + "rewards/rejected": -4.71484375, + "step": 3156 + }, + { + "epoch": 0.6264199613076046, + "grad_norm": 28.791340004811218, + "learning_rate": 4.317261634520671e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 4.08203125, + "logps/chosen": -1141.0, + "logps/rejected": -1194.0, + "loss": 0.3941, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.41796875, + "rewards/margins": 9.8359375, + "rewards/rejected": -7.39453125, + "step": 3157 + }, + { + "epoch": 0.6266183838484052, + "grad_norm": 31.24765723749263, + "learning_rate": 4.314254834702752e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 3.98046875, + "logps/chosen": -1030.0, + 
"logps/rejected": -781.0, + "loss": 0.4817, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.87109375, + "rewards/margins": 6.3671875, + "rewards/rejected": -4.49609375, + "step": 3158 + }, + { + "epoch": 0.6268168063892058, + "grad_norm": 29.96562684974623, + "learning_rate": 4.311248603665726e-07, + "logits/chosen": 3.828125, + "logits/rejected": 4.1953125, + "logps/chosen": -835.0, + "logps/rejected": -1836.0, + "loss": 0.4361, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.490234375, + "rewards/margins": 9.6171875, + "rewards/rejected": -7.1328125, + "step": 3159 + }, + { + "epoch": 0.6270152289300065, + "grad_norm": 32.54807464026178, + "learning_rate": 4.3082429428516275e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.703125, + "logps/chosen": -1316.0, + "logps/rejected": -674.0, + "loss": 0.3936, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.345703125, + "rewards/margins": 6.890625, + "rewards/rejected": -4.53515625, + "step": 3160 + }, + { + "epoch": 0.6272136514708071, + "grad_norm": 29.264424634053842, + "learning_rate": 4.3052378537022215e-07, + "logits/chosen": 3.796875, + "logits/rejected": 3.79296875, + "logps/chosen": -1035.0, + "logps/rejected": -1318.0, + "loss": 0.3772, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9140625, + "rewards/margins": 9.5078125, + "rewards/rejected": -6.60546875, + "step": 3161 + }, + { + "epoch": 0.6274120740116077, + "grad_norm": 32.482572210399255, + "learning_rate": 4.302233337658993e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.015625, + "logps/chosen": -852.0, + "logps/rejected": -861.0, + "loss": 0.3434, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.015625, + "rewards/margins": 7.109375, + "rewards/rejected": -5.09375, + "step": 3162 + }, + { + "epoch": 0.6276104965524083, + "grad_norm": 33.85798297961924, + "learning_rate": 4.2992293961631567e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.453125, + "logps/chosen": -1144.0, + 
"logps/rejected": -943.0, + "loss": 0.322, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.69140625, + "rewards/margins": 9.3671875, + "rewards/rejected": -5.671875, + "step": 3163 + }, + { + "epoch": 0.627808919093209, + "grad_norm": 29.605237515750662, + "learning_rate": 4.296226030655649e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.46484375, + "logps/chosen": -868.0, + "logps/rejected": -755.0, + "loss": 0.3068, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.46875, + "rewards/margins": 9.5234375, + "rewards/rejected": -6.0625, + "step": 3164 + }, + { + "epoch": 0.6280073416340096, + "grad_norm": 31.673190836920398, + "learning_rate": 4.293223242577131e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.73046875, + "logps/chosen": -875.0, + "logps/rejected": -770.0, + "loss": 0.4629, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3125, + "rewards/margins": 6.703125, + "rewards/rejected": -4.3984375, + "step": 3165 + }, + { + "epoch": 0.6282057641748102, + "grad_norm": 25.674345973093377, + "learning_rate": 4.290221033367989e-07, + "logits/chosen": 4.515625, + "logits/rejected": 4.4765625, + "logps/chosen": -1182.0, + "logps/rejected": -874.0, + "loss": 0.335, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1015625, + "rewards/margins": 9.3671875, + "rewards/rejected": -6.2734375, + "step": 3166 + }, + { + "epoch": 0.6284041867156109, + "grad_norm": 31.33774343787391, + "learning_rate": 4.2872194044683263e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.65625, + "logps/chosen": -1338.0, + "logps/rejected": -792.0, + "loss": 0.2641, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.546875, + "rewards/margins": 9.640625, + "rewards/rejected": -6.10546875, + "step": 3167 + }, + { + "epoch": 0.6286026092564115, + "grad_norm": 24.595691808522503, + "learning_rate": 4.2842183573179735e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.390625, + "logps/chosen": -1119.0, + "logps/rejected": -710.0, + 
"loss": 0.4076, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.875, + "rewards/margins": 8.140625, + "rewards/rejected": -5.2578125, + "step": 3168 + }, + { + "epoch": 0.6288010317972121, + "grad_norm": 28.58063297313039, + "learning_rate": 4.281217893356478e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.375, + "logps/chosen": -944.0, + "logps/rejected": -649.0, + "loss": 0.3924, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8515625, + "rewards/margins": 7.74609375, + "rewards/rejected": -4.900390625, + "step": 3169 + }, + { + "epoch": 0.6289994543380129, + "grad_norm": 37.5468524461853, + "learning_rate": 4.2782180140231084e-07, + "logits/chosen": 3.984375, + "logits/rejected": 4.09375, + "logps/chosen": -1030.0, + "logps/rejected": -935.0, + "loss": 0.391, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.61328125, + "rewards/margins": 8.46875, + "rewards/rejected": -5.8515625, + "step": 3170 + }, + { + "epoch": 0.6291978768788135, + "grad_norm": 33.278124033490656, + "learning_rate": 4.275218720756856e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.42578125, + "logps/chosen": -1193.0, + "logps/rejected": -792.0, + "loss": 0.2009, + "rewards/accuracies": 0.96875, + "rewards/chosen": 4.07421875, + "rewards/margins": 10.2421875, + "rewards/rejected": -6.1640625, + "step": 3171 + }, + { + "epoch": 0.6293962994196141, + "grad_norm": 30.813703774852875, + "learning_rate": 4.272220014996427e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.8828125, + "logps/chosen": -744.5, + "logps/rejected": -542.0, + "loss": 0.355, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.748046875, + "rewards/margins": 9.0234375, + "rewards/rejected": -6.27734375, + "step": 3172 + }, + { + "epoch": 0.6295947219604147, + "grad_norm": 38.425568121315315, + "learning_rate": 4.269221898180246e-07, + "logits/chosen": 3.64453125, + "logits/rejected": 3.70703125, + "logps/chosen": -810.0, + "logps/rejected": -735.0, + "loss": 0.5108, + 
"rewards/accuracies": 0.71875, + "rewards/chosen": 1.86328125, + "rewards/margins": 5.0, + "rewards/rejected": -3.13671875, + "step": 3173 + }, + { + "epoch": 0.6297931445012154, + "grad_norm": 29.53215357470659, + "learning_rate": 4.2662243717464574e-07, + "logits/chosen": 4.109375, + "logits/rejected": 3.75390625, + "logps/chosen": -1032.0, + "logps/rejected": -767.5, + "loss": 0.3069, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.296875, + "rewards/margins": 7.828125, + "rewards/rejected": -4.53125, + "step": 3174 + }, + { + "epoch": 0.629991567042016, + "grad_norm": 25.996056511487623, + "learning_rate": 4.2632274371329234e-07, + "logits/chosen": 3.59375, + "logits/rejected": 3.70703125, + "logps/chosen": -1101.0, + "logps/rejected": -699.0, + "loss": 0.3181, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.18359375, + "rewards/margins": 8.703125, + "rewards/rejected": -5.5, + "step": 3175 + }, + { + "epoch": 0.6301899895828166, + "grad_norm": 35.52309502057912, + "learning_rate": 4.260231095777218e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.4453125, + "logps/chosen": -1092.5, + "logps/rejected": -628.5, + "loss": 0.443, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.703125, + "rewards/margins": 6.453125, + "rewards/rejected": -3.759765625, + "step": 3176 + }, + { + "epoch": 0.6303884121236173, + "grad_norm": 37.75598390005294, + "learning_rate": 4.2572353491166356e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 3.62890625, + "logps/chosen": -947.0, + "logps/rejected": -828.0, + "loss": 0.4717, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.599609375, + "rewards/margins": 6.234375, + "rewards/rejected": -4.63671875, + "step": 3177 + }, + { + "epoch": 0.6305868346644179, + "grad_norm": 36.09891078457141, + "learning_rate": 4.254240198588178e-07, + "logits/chosen": 4.46875, + "logits/rejected": 4.671875, + "logps/chosen": -1167.0, + "logps/rejected": -1029.0, + "loss": 0.4326, + "rewards/accuracies": 0.875, + 
"rewards/chosen": 2.9140625, + "rewards/margins": 7.921875, + "rewards/rejected": -5.015625, + "step": 3178 + }, + { + "epoch": 0.6307852572052185, + "grad_norm": 40.62419586847054, + "learning_rate": 4.2512456456285696e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.25, + "logps/chosen": -684.0, + "logps/rejected": -537.0, + "loss": 0.4942, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.654022216796875, + "rewards/margins": 5.859375, + "rewards/rejected": -4.208984375, + "step": 3179 + }, + { + "epoch": 0.6309836797460191, + "grad_norm": 28.946704654819495, + "learning_rate": 4.248251691674242e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.98046875, + "logps/chosen": -793.0, + "logps/rejected": -671.5, + "loss": 0.433, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5, + "rewards/margins": 6.8984375, + "rewards/rejected": -4.41015625, + "step": 3180 + }, + { + "epoch": 0.6311821022868198, + "grad_norm": 38.91455065112409, + "learning_rate": 4.2452583381613427e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.890625, + "logps/chosen": -985.0, + "logps/rejected": -539.5, + "loss": 0.4977, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.23828125, + "rewards/margins": 6.703125, + "rewards/rejected": -4.45703125, + "step": 3181 + }, + { + "epoch": 0.6313805248276204, + "grad_norm": 24.95910536422301, + "learning_rate": 4.242265586525733e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.9296875, + "logps/chosen": -1146.0, + "logps/rejected": -739.0, + "loss": 0.2855, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.69140625, + "rewards/margins": 9.28125, + "rewards/rejected": -6.578125, + "step": 3182 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 28.082404994159358, + "learning_rate": 4.2392734382029774e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.12109375, + "logps/chosen": -1120.0, + "logps/rejected": -810.0, + "loss": 0.4805, + "rewards/accuracies": 0.875, + "rewards/chosen": 
2.640625, + "rewards/margins": 7.6484375, + "rewards/rejected": -5.00390625, + "step": 3183 + }, + { + "epoch": 0.6317773699092217, + "grad_norm": 24.41314744091721, + "learning_rate": 4.2362818946283594e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 4.06640625, + "logps/chosen": -704.5, + "logps/rejected": -601.5, + "loss": 0.3423, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4375, + "rewards/margins": 9.046875, + "rewards/rejected": -6.60546875, + "step": 3184 + }, + { + "epoch": 0.6319757924500223, + "grad_norm": 27.075043099281825, + "learning_rate": 4.233290957236869e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 3.90625, + "logps/chosen": -951.0, + "logps/rejected": -827.0, + "loss": 0.3724, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4375, + "rewards/margins": 9.7265625, + "rewards/rejected": -7.28125, + "step": 3185 + }, + { + "epoch": 0.6321742149908229, + "grad_norm": 22.87856469675754, + "learning_rate": 4.230300627463206e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 4.08984375, + "logps/chosen": -993.0, + "logps/rejected": -800.0, + "loss": 0.3526, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.625, + "rewards/margins": 8.3515625, + "rewards/rejected": -5.734375, + "step": 3186 + }, + { + "epoch": 0.6323726375316235, + "grad_norm": 29.24758211188395, + "learning_rate": 4.227310906741777e-07, + "logits/chosen": 4.44140625, + "logits/rejected": 4.4375, + "logps/chosen": -841.5, + "logps/rejected": -800.5, + "loss": 0.2431, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.8046875, + "rewards/margins": 9.96875, + "rewards/rejected": -6.1484375, + "step": 3187 + }, + { + "epoch": 0.6325710600724243, + "grad_norm": 31.595755504521094, + "learning_rate": 4.2243217965067003e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.2421875, + "logps/chosen": -1251.0, + "logps/rejected": -1492.0, + "loss": 0.2593, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.931640625, + 
"rewards/margins": 10.859375, + "rewards/rejected": -7.90625, + "step": 3188 + }, + { + "epoch": 0.6327694826132249, + "grad_norm": 24.092473789936616, + "learning_rate": 4.221333298191796e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 3.8515625, + "logps/chosen": -1187.0, + "logps/rejected": -989.5, + "loss": 0.2593, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.30859375, + "rewards/margins": 10.5546875, + "rewards/rejected": -7.2265625, + "step": 3189 + }, + { + "epoch": 0.6329679051540255, + "grad_norm": 31.892453438193968, + "learning_rate": 4.218345413230595e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 4.3359375, + "logps/chosen": -848.0, + "logps/rejected": -761.0, + "loss": 0.4713, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.625, + "rewards/margins": 7.1171875, + "rewards/rejected": -4.47265625, + "step": 3190 + }, + { + "epoch": 0.6331663276948262, + "grad_norm": 33.065384107192266, + "learning_rate": 4.215358143056333e-07, + "logits/chosen": 3.5, + "logits/rejected": 3.44921875, + "logps/chosen": -1180.0, + "logps/rejected": -1138.0, + "loss": 0.3504, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.626953125, + "rewards/margins": 10.578125, + "rewards/rejected": -7.9609375, + "step": 3191 + }, + { + "epoch": 0.6333647502356268, + "grad_norm": 29.38414330156847, + "learning_rate": 4.2123714891019514e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.296875, + "logps/chosen": -1121.5, + "logps/rejected": -757.0, + "loss": 0.3597, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.70703125, + "rewards/margins": 8.015625, + "rewards/rejected": -5.30859375, + "step": 3192 + }, + { + "epoch": 0.6335631727764274, + "grad_norm": 31.71726783474024, + "learning_rate": 4.209385452800095e-07, + "logits/chosen": 4.046875, + "logits/rejected": 3.859375, + "logps/chosen": -1192.0, + "logps/rejected": -1073.0, + "loss": 0.2847, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.159912109375, + "rewards/margins": 
10.890625, + "rewards/rejected": -7.7421875, + "step": 3193 + }, + { + "epoch": 0.6337615953172281, + "grad_norm": 30.187846516577984, + "learning_rate": 4.2064000355831086e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 4.05859375, + "logps/chosen": -831.0, + "logps/rejected": -763.5, + "loss": 0.4447, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.51171875, + "rewards/margins": 6.03125, + "rewards/rejected": -3.521484375, + "step": 3194 + }, + { + "epoch": 0.6339600178580287, + "grad_norm": 34.74520914021198, + "learning_rate": 4.2034152388830493e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 4.375, + "logps/chosen": -657.5, + "logps/rejected": -714.5, + "loss": 0.4061, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.86328125, + "rewards/margins": 7.15625, + "rewards/rejected": -5.296875, + "step": 3195 + }, + { + "epoch": 0.6341584403988293, + "grad_norm": 33.338330762897456, + "learning_rate": 4.2004310641316667e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.7890625, + "logps/chosen": -1079.0, + "logps/rejected": -607.0, + "loss": 0.2962, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.720703125, + "rewards/margins": 8.8515625, + "rewards/rejected": -6.1328125, + "step": 3196 + }, + { + "epoch": 0.6343568629396299, + "grad_norm": 32.60890695336736, + "learning_rate": 4.197447512760417e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.80859375, + "logps/chosen": -907.0, + "logps/rejected": -609.0, + "loss": 0.4273, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.251953125, + "rewards/margins": 7.3046875, + "rewards/rejected": -5.05078125, + "step": 3197 + }, + { + "epoch": 0.6345552854804306, + "grad_norm": 29.569240053834765, + "learning_rate": 4.1944645862004604e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.16796875, + "logps/chosen": -1099.0, + "logps/rejected": -852.0, + "loss": 0.3153, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.984375, + "rewards/margins": 10.234375, + 
"rewards/rejected": -7.2578125, + "step": 3198 + }, + { + "epoch": 0.6347537080212312, + "grad_norm": 30.081581205735585, + "learning_rate": 4.1914822858826493e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.78515625, + "logps/chosen": -1122.0, + "logps/rejected": -750.0, + "loss": 0.2012, + "rewards/accuracies": 0.96875, + "rewards/chosen": 4.02734375, + "rewards/margins": 11.03125, + "rewards/rejected": -7.015625, + "step": 3199 + }, + { + "epoch": 0.6349521305620318, + "grad_norm": 28.760008982543965, + "learning_rate": 4.1885006132375434e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.828125, + "logps/chosen": -588.5, + "logps/rejected": -545.0, + "loss": 0.5394, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.65625, + "rewards/margins": 5.375, + "rewards/rejected": -3.71484375, + "step": 3200 + }, + { + "epoch": 0.6351505531028325, + "grad_norm": 36.438579843458015, + "learning_rate": 4.185519569695395e-07, + "logits/chosen": 3.66015625, + "logits/rejected": 3.82421875, + "logps/chosen": -1074.0, + "logps/rejected": -894.0, + "loss": 0.483, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.14453125, + "rewards/margins": 8.4609375, + "rewards/rejected": -6.3203125, + "step": 3201 + }, + { + "epoch": 0.6353489756436331, + "grad_norm": 32.20233235284624, + "learning_rate": 4.1825391566861625e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 4.09765625, + "logps/chosen": -976.0, + "logps/rejected": -1374.0, + "loss": 0.4266, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3623046875, + "rewards/margins": 8.0625, + "rewards/rejected": -5.7265625, + "step": 3202 + }, + { + "epoch": 0.6355473981844337, + "grad_norm": 33.11986544299368, + "learning_rate": 4.179559375639491e-07, + "logits/chosen": 4.375, + "logits/rejected": 3.8984375, + "logps/chosen": -962.0, + "logps/rejected": -616.0, + "loss": 0.4222, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.46826171875, + "rewards/margins": 6.61328125, + "rewards/rejected": 
-4.140625, + "step": 3203 + }, + { + "epoch": 0.6357458207252343, + "grad_norm": 30.781468080972697, + "learning_rate": 4.176580227984733e-07, + "logits/chosen": 3.6015625, + "logits/rejected": 3.62890625, + "logps/chosen": -758.5, + "logps/rejected": -669.25, + "loss": 0.4043, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5771484375, + "rewards/margins": 9.26171875, + "rewards/rejected": -6.693359375, + "step": 3204 + }, + { + "epoch": 0.635944243266035, + "grad_norm": 31.195893434631312, + "learning_rate": 4.173601715150931e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.82421875, + "logps/chosen": -1271.0, + "logps/rejected": -936.5, + "loss": 0.4082, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.658203125, + "rewards/margins": 9.4375, + "rewards/rejected": -6.765625, + "step": 3205 + }, + { + "epoch": 0.6361426658068356, + "grad_norm": 30.305697374889842, + "learning_rate": 4.1706238385668247e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 4.05859375, + "logps/chosen": -1136.0, + "logps/rejected": -974.0, + "loss": 0.4562, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.26953125, + "rewards/margins": 20.2109375, + "rewards/rejected": -17.875, + "step": 3206 + }, + { + "epoch": 0.6363410883476363, + "grad_norm": 35.18973701831804, + "learning_rate": 4.167646599660851e-07, + "logits/chosen": 3.984375, + "logits/rejected": 4.0, + "logps/chosen": -973.0, + "logps/rejected": -798.0, + "loss": 0.4399, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.619140625, + "rewards/margins": 7.646484375, + "rewards/rejected": -6.03125, + "step": 3207 + }, + { + "epoch": 0.636539510888437, + "grad_norm": 27.5043434214977, + "learning_rate": 4.1646699998611347e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.16015625, + "logps/chosen": -1059.0, + "logps/rejected": -886.5, + "loss": 0.3233, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.94140625, + "rewards/margins": 9.0703125, + "rewards/rejected": -6.1328125, + 
"step": 3208 + }, + { + "epoch": 0.6367379334292376, + "grad_norm": 31.957014989554068, + "learning_rate": 4.1616940405955013e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 3.7734375, + "logps/chosen": -1165.0, + "logps/rejected": -761.0, + "loss": 0.5555, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.021484375, + "rewards/margins": 6.09375, + "rewards/rejected": -4.0703125, + "step": 3209 + }, + { + "epoch": 0.6369363559700382, + "grad_norm": 38.26407717092277, + "learning_rate": 4.1587187232914625e-07, + "logits/chosen": 3.6484375, + "logits/rejected": 3.45703125, + "logps/chosen": -864.0, + "logps/rejected": -560.0, + "loss": 0.6882, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.66455078125, + "rewards/margins": 3.45703125, + "rewards/rejected": -2.7900390625, + "step": 3210 + }, + { + "epoch": 0.6371347785108389, + "grad_norm": 34.96759863339199, + "learning_rate": 4.1557440493762264e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 4.3203125, + "logps/chosen": -947.0, + "logps/rejected": -806.0, + "loss": 0.4938, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.4365234375, + "rewards/margins": 6.0595703125, + "rewards/rejected": -4.607421875, + "step": 3211 + }, + { + "epoch": 0.6373332010516395, + "grad_norm": 32.97573926049331, + "learning_rate": 4.1527700202766915e-07, + "logits/chosen": 3.65625, + "logits/rejected": 3.640625, + "logps/chosen": -1074.0, + "logps/rejected": -946.0, + "loss": 0.4181, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.787109375, + "rewards/margins": 8.9765625, + "rewards/rejected": -6.1875, + "step": 3212 + }, + { + "epoch": 0.6375316235924401, + "grad_norm": 38.771860450071294, + "learning_rate": 4.149796637419446e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 3.73828125, + "logps/chosen": -857.0, + "logps/rejected": -442.5, + "loss": 0.4663, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3232421875, + "rewards/margins": 5.3671875, + "rewards/rejected": -3.037109375, + 
"step": 3213 + }, + { + "epoch": 0.6377300461332407, + "grad_norm": 27.583217272687012, + "learning_rate": 4.146823902230772e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.3515625, + "logps/chosen": -992.5, + "logps/rejected": -845.0, + "loss": 0.4703, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6875, + "rewards/margins": 7.453125, + "rewards/rejected": -4.7578125, + "step": 3214 + }, + { + "epoch": 0.6379284686740414, + "grad_norm": 24.980765854040502, + "learning_rate": 4.1438518161366323e-07, + "logits/chosen": 4.32421875, + "logits/rejected": 4.453125, + "logps/chosen": -1244.0, + "logps/rejected": -854.0, + "loss": 0.281, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.333984375, + "rewards/margins": 9.6328125, + "rewards/rejected": -6.3125, + "step": 3215 + }, + { + "epoch": 0.638126891214842, + "grad_norm": 30.168836000052806, + "learning_rate": 4.1408803805626893e-07, + "logits/chosen": 3.6015625, + "logits/rejected": 3.6796875, + "logps/chosen": -1179.0, + "logps/rejected": -650.5, + "loss": 0.4057, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.525390625, + "rewards/margins": 7.5390625, + "rewards/rejected": -5.0078125, + "step": 3216 + }, + { + "epoch": 0.6383253137556426, + "grad_norm": 24.3091167579243, + "learning_rate": 4.137909596934284e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.14453125, + "logps/chosen": -940.0, + "logps/rejected": -812.0, + "loss": 0.3901, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.412109375, + "rewards/margins": 7.6796875, + "rewards/rejected": -5.2578125, + "step": 3217 + }, + { + "epoch": 0.6385237362964433, + "grad_norm": 25.224797276120444, + "learning_rate": 4.1349394666764503e-07, + "logits/chosen": 4.359375, + "logits/rejected": 4.1796875, + "logps/chosen": -1100.0, + "logps/rejected": -775.0, + "loss": 0.4146, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.75390625, + "rewards/margins": 7.48046875, + "rewards/rejected": -5.71875, + "step": 3218 + }, + { + 
"epoch": 0.6387221588372439, + "grad_norm": 36.58720641306566, + "learning_rate": 4.131969991213906e-07, + "logits/chosen": 3.9375, + "logits/rejected": 3.9921875, + "logps/chosen": -1037.0, + "logps/rejected": -621.0, + "loss": 0.3075, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.45703125, + "rewards/margins": 7.1484375, + "rewards/rejected": -4.671875, + "step": 3219 + }, + { + "epoch": 0.6389205813780445, + "grad_norm": 37.30347199275169, + "learning_rate": 4.1290011719710573e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 4.04296875, + "logps/chosen": -1111.5, + "logps/rejected": -808.0, + "loss": 0.3413, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.27734375, + "rewards/margins": 8.5, + "rewards/rejected": -5.2265625, + "step": 3220 + }, + { + "epoch": 0.6391190039188451, + "grad_norm": 34.863834168645646, + "learning_rate": 4.126033010371991e-07, + "logits/chosen": 4.23046875, + "logits/rejected": 4.2890625, + "logps/chosen": -733.0, + "logps/rejected": -629.0, + "loss": 0.411, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.341796875, + "rewards/margins": 7.4765625, + "rewards/rejected": -5.140625, + "step": 3221 + }, + { + "epoch": 0.6393174264596458, + "grad_norm": 26.303157332017303, + "learning_rate": 4.123065507840484e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 3.4375, + "logps/chosen": -1800.5, + "logps/rejected": -668.5, + "loss": 0.259, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.509765625, + "rewards/margins": 6.1484375, + "rewards/rejected": -5.6328125, + "step": 3222 + }, + { + "epoch": 0.6395158490004464, + "grad_norm": 25.532853815978008, + "learning_rate": 4.120098665799996e-07, + "logits/chosen": 3.875, + "logits/rejected": 4.390625, + "logps/chosen": -923.0, + "logps/rejected": -738.0, + "loss": 0.4049, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5703125, + "rewards/margins": 8.62109375, + "rewards/rejected": -6.04296875, + "step": 3223 + }, + { + "epoch": 0.639714271541247, 
+ "grad_norm": 26.83565479430335, + "learning_rate": 4.1171324856736644e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.1953125, + "logps/chosen": -968.0, + "logps/rejected": -695.0, + "loss": 0.369, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1943359375, + "rewards/margins": 6.8046875, + "rewards/rejected": -4.62109375, + "step": 3224 + }, + { + "epoch": 0.6399126940820478, + "grad_norm": 27.48740978105299, + "learning_rate": 4.114166968884316e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.4375, + "logps/chosen": -907.0, + "logps/rejected": -765.0, + "loss": 0.454, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4609375, + "rewards/margins": 7.98828125, + "rewards/rejected": -5.521484375, + "step": 3225 + }, + { + "epoch": 0.6401111166228484, + "grad_norm": 40.002479000661424, + "learning_rate": 4.1112021168544555e-07, + "logits/chosen": 3.7421875, + "logits/rejected": 3.1171875, + "logps/chosen": -881.0, + "logps/rejected": -531.0, + "loss": 0.4251, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.9132080078125, + "rewards/margins": 6.984375, + "rewards/rejected": -5.07421875, + "step": 3226 + }, + { + "epoch": 0.640309539163649, + "grad_norm": 27.915510163542045, + "learning_rate": 4.108237931006271e-07, + "logits/chosen": 3.8515625, + "logits/rejected": 3.85546875, + "logps/chosen": -764.0, + "logps/rejected": -1828.0, + "loss": 0.4131, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.9228515625, + "rewards/margins": 10.7578125, + "rewards/rejected": -8.828125, + "step": 3227 + }, + { + "epoch": 0.6405079617044497, + "grad_norm": 25.574377891391666, + "learning_rate": 4.105274412761627e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.1875, + "logps/chosen": -1118.0, + "logps/rejected": -936.0, + "loss": 0.4351, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.79296875, + "rewards/margins": 9.3359375, + "rewards/rejected": -6.5234375, + "step": 3228 + }, + { + "epoch": 0.6407063842452503, + 
"grad_norm": 38.30744248786844, + "learning_rate": 4.1023115635420726e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.375, + "logps/chosen": -710.0, + "logps/rejected": -1737.0, + "loss": 0.4045, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.939453125, + "rewards/margins": 9.484375, + "rewards/rejected": -7.53515625, + "step": 3229 + }, + { + "epoch": 0.6409048067860509, + "grad_norm": 32.75891333810667, + "learning_rate": 4.0993493847688345e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.46875, + "logps/chosen": -858.5, + "logps/rejected": -1363.0, + "loss": 0.351, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.3515625, + "rewards/margins": 8.5625, + "rewards/rejected": -6.203125, + "step": 3230 + }, + { + "epoch": 0.6411032293268515, + "grad_norm": 27.68031788564575, + "learning_rate": 4.096387877862816e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.14453125, + "logps/chosen": -925.0, + "logps/rejected": -906.5, + "loss": 0.3269, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.83203125, + "rewards/margins": 8.7578125, + "rewards/rejected": -6.90625, + "step": 3231 + }, + { + "epoch": 0.6413016518676522, + "grad_norm": 35.25793421789061, + "learning_rate": 4.0934270442446007e-07, + "logits/chosen": 3.921875, + "logits/rejected": 4.0390625, + "logps/chosen": -1096.0, + "logps/rejected": -861.0, + "loss": 0.4683, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.17578125, + "rewards/margins": 7.23828125, + "rewards/rejected": -5.0703125, + "step": 3232 + }, + { + "epoch": 0.6415000744084528, + "grad_norm": 36.29022035528275, + "learning_rate": 4.090466885334445e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 4.1875, + "logps/chosen": -1143.0, + "logps/rejected": -1561.5, + "loss": 0.4591, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.076171875, + "rewards/margins": 10.15625, + "rewards/rejected": -8.0859375, + "step": 3233 + }, + { + "epoch": 0.6416984969492534, + "grad_norm": 
23.593839428159324, + "learning_rate": 4.087507402552288e-07, + "logits/chosen": 3.6953125, + "logits/rejected": 3.7109375, + "logps/chosen": -804.0, + "logps/rejected": -791.5, + "loss": 0.4869, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.830078125, + "rewards/margins": 7.546875, + "rewards/rejected": -5.703125, + "step": 3234 + }, + { + "epoch": 0.6418969194900541, + "grad_norm": 30.5782469309142, + "learning_rate": 4.08454859731774e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.20703125, + "logps/chosen": -1049.0, + "logps/rejected": -880.0, + "loss": 0.3021, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.8984375, + "rewards/margins": 9.4609375, + "rewards/rejected": -5.572265625, + "step": 3235 + }, + { + "epoch": 0.6420953420308547, + "grad_norm": 29.883938412829114, + "learning_rate": 4.081590471050085e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 3.6328125, + "logps/chosen": -1046.0, + "logps/rejected": -982.0, + "loss": 0.4982, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.2080078125, + "rewards/margins": 7.8046875, + "rewards/rejected": -5.58984375, + "step": 3236 + }, + { + "epoch": 0.6422937645716553, + "grad_norm": 35.997535696517964, + "learning_rate": 4.0786330251682855e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.1484375, + "logps/chosen": -905.0, + "logps/rejected": -687.0, + "loss": 0.4712, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.45166015625, + "rewards/margins": 6.953125, + "rewards/rejected": -4.51171875, + "step": 3237 + }, + { + "epoch": 0.6424921871124559, + "grad_norm": 25.055529255935074, + "learning_rate": 4.075676261090976e-07, + "logits/chosen": 4.28515625, + "logits/rejected": 4.1015625, + "logps/chosen": -906.0, + "logps/rejected": -643.0, + "loss": 0.4211, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.86328125, + "rewards/margins": 7.2265625, + "rewards/rejected": -4.37109375, + "step": 3238 + }, + { + "epoch": 0.6426906096532566, + "grad_norm": 
32.03150301578797, + "learning_rate": 4.0727201802364655e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.6953125, + "logps/chosen": -855.0, + "logps/rejected": -673.0, + "loss": 0.5003, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4072265625, + "rewards/margins": 6.9453125, + "rewards/rejected": -4.546875, + "step": 3239 + }, + { + "epoch": 0.6428890321940572, + "grad_norm": 30.243946139951525, + "learning_rate": 4.069764784022729e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.9765625, + "logps/chosen": -861.0, + "logps/rejected": -769.5, + "loss": 0.3721, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.529296875, + "rewards/margins": 8.7265625, + "rewards/rejected": -7.203125, + "step": 3240 + }, + { + "epoch": 0.6430874547348578, + "grad_norm": 33.608381791190524, + "learning_rate": 4.0668100738674205e-07, + "logits/chosen": 3.75, + "logits/rejected": 3.7890625, + "logps/chosen": -905.0, + "logps/rejected": -718.0, + "loss": 0.3775, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.09375, + "rewards/margins": 7.703125, + "rewards/rejected": -5.6171875, + "step": 3241 + }, + { + "epoch": 0.6432858772756586, + "grad_norm": 28.684153055092636, + "learning_rate": 4.0638560511878607e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.1796875, + "logps/chosen": -1183.0, + "logps/rejected": -783.0, + "loss": 0.4488, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.333984375, + "rewards/margins": 7.2421875, + "rewards/rejected": -4.90625, + "step": 3242 + }, + { + "epoch": 0.6434842998164592, + "grad_norm": 29.71858411569147, + "learning_rate": 4.0609027174010444e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 4.0625, + "logps/chosen": -916.0, + "logps/rejected": -729.0, + "loss": 0.4257, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.947265625, + "rewards/margins": 8.640625, + "rewards/rejected": -5.6796875, + "step": 3243 + }, + { + "epoch": 0.6436827223572598, + "grad_norm": 34.433885713466694, + 
"learning_rate": 4.0579500739236307e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.07421875, + "logps/chosen": -702.5, + "logps/rejected": -541.0, + "loss": 0.6686, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.482666015625, + "rewards/margins": 3.953125, + "rewards/rejected": -2.4736328125, + "step": 3244 + }, + { + "epoch": 0.6438811448980604, + "grad_norm": 37.661227553148244, + "learning_rate": 4.05499812217195e-07, + "logits/chosen": 3.5859375, + "logits/rejected": 3.7109375, + "logps/chosen": -877.5, + "logps/rejected": -920.0, + "loss": 0.5445, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.82421875, + "rewards/margins": 8.296875, + "rewards/rejected": -6.47265625, + "step": 3245 + }, + { + "epoch": 0.6440795674388611, + "grad_norm": 33.03414669871045, + "learning_rate": 4.052046863562003e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.2734375, + "logps/chosen": -1047.0, + "logps/rejected": -683.0, + "loss": 0.3839, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4423828125, + "rewards/margins": 7.484375, + "rewards/rejected": -5.041015625, + "step": 3246 + }, + { + "epoch": 0.6442779899796617, + "grad_norm": 39.47244407008011, + "learning_rate": 4.049096299509455e-07, + "logits/chosen": 4.109375, + "logits/rejected": 3.95703125, + "logps/chosen": -889.0, + "logps/rejected": -771.5, + "loss": 0.4958, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.0966796875, + "rewards/margins": 6.359375, + "rewards/rejected": -4.26171875, + "step": 3247 + }, + { + "epoch": 0.6444764125204623, + "grad_norm": 24.43624545498123, + "learning_rate": 4.04614643142964e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.0859375, + "logps/chosen": -1131.0, + "logps/rejected": -970.0, + "loss": 0.221, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.93359375, + "rewards/margins": 11.3125, + "rewards/rejected": -8.3671875, + "step": 3248 + }, + { + "epoch": 0.644674835061263, + "grad_norm": 40.68225492289948, + "learning_rate": 
4.043197260737555e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.89453125, + "logps/chosen": -807.0, + "logps/rejected": -620.0, + "loss": 0.4148, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.232421875, + "rewards/margins": 6.8203125, + "rewards/rejected": -4.58984375, + "step": 3249 + }, + { + "epoch": 0.6448732576020636, + "grad_norm": 33.562735319243544, + "learning_rate": 4.0402487888478685e-07, + "logits/chosen": 4.15625, + "logits/rejected": 3.9765625, + "logps/chosen": -748.0, + "logps/rejected": -580.5, + "loss": 0.393, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.046875, + "rewards/margins": 7.671875, + "rewards/rejected": -5.625, + "step": 3250 + }, + { + "epoch": 0.6450716801428642, + "grad_norm": 45.01471643360857, + "learning_rate": 4.0373010171749066e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.90234375, + "logps/chosen": -1051.0, + "logps/rejected": -902.5, + "loss": 0.4426, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.138671875, + "rewards/margins": 8.41796875, + "rewards/rejected": -6.27734375, + "step": 3251 + }, + { + "epoch": 0.6452701026836649, + "grad_norm": 26.84275290443995, + "learning_rate": 4.034353947132666e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.8671875, + "logps/chosen": -1238.0, + "logps/rejected": -776.5, + "loss": 0.3356, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2265625, + "rewards/margins": 8.875, + "rewards/rejected": -5.66015625, + "step": 3252 + }, + { + "epoch": 0.6454685252244655, + "grad_norm": 34.99772934484841, + "learning_rate": 4.0314075801348005e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.86328125, + "logps/chosen": -992.0, + "logps/rejected": -794.0, + "loss": 0.2995, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.650390625, + "rewards/margins": 9.1171875, + "rewards/rejected": -6.45703125, + "step": 3253 + }, + { + "epoch": 0.6456669477652661, + "grad_norm": 27.671378531829365, + "learning_rate": 
4.028461917594631e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 4.12890625, + "logps/chosen": -1302.0, + "logps/rejected": -2077.0, + "loss": 0.3721, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.27392578125, + "rewards/margins": 11.1015625, + "rewards/rejected": -8.84375, + "step": 3254 + }, + { + "epoch": 0.6458653703060667, + "grad_norm": 35.62614100314334, + "learning_rate": 4.025516960925144e-07, + "logits/chosen": 4.25, + "logits/rejected": 3.71484375, + "logps/chosen": -1119.0, + "logps/rejected": -1285.0, + "loss": 0.2995, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.23828125, + "rewards/margins": 10.65625, + "rewards/rejected": -8.41796875, + "step": 3255 + }, + { + "epoch": 0.6460637928468674, + "grad_norm": 34.851575234881594, + "learning_rate": 4.022572711538975e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 3.98828125, + "logps/chosen": -984.0, + "logps/rejected": -550.5, + "loss": 0.3804, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.263671875, + "rewards/margins": 8.3203125, + "rewards/rejected": -6.0703125, + "step": 3256 + }, + { + "epoch": 0.646262215387668, + "grad_norm": 36.274191003455215, + "learning_rate": 4.0196291708484356e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.2265625, + "logps/chosen": -1352.0, + "logps/rejected": -783.0, + "loss": 0.3469, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.857421875, + "rewards/margins": 9.40625, + "rewards/rejected": -6.5625, + "step": 3257 + }, + { + "epoch": 0.6464606379284686, + "grad_norm": 32.37807623505923, + "learning_rate": 4.016686340265484e-07, + "logits/chosen": 3.27734375, + "logits/rejected": 3.390625, + "logps/chosen": -975.0, + "logps/rejected": -1053.0, + "loss": 0.433, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.73779296875, + "rewards/margins": 16.734375, + "rewards/rejected": -16.03125, + "step": 3258 + }, + { + "epoch": 0.6466590604692694, + "grad_norm": 23.370366675261163, + "learning_rate": 
4.013744221201749e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 3.75390625, + "logps/chosen": -1294.0, + "logps/rejected": -706.0, + "loss": 0.3376, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.966796875, + "rewards/margins": 9.2734375, + "rewards/rejected": -6.296875, + "step": 3259 + }, + { + "epoch": 0.64685748301007, + "grad_norm": 37.452702138600294, + "learning_rate": 4.010802815068509e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.3046875, + "logps/chosen": -865.0, + "logps/rejected": -601.0, + "loss": 0.3899, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.259765625, + "rewards/margins": 7.49609375, + "rewards/rejected": -5.232421875, + "step": 3260 + }, + { + "epoch": 0.6470559055508706, + "grad_norm": 32.74698176822162, + "learning_rate": 4.007862123276705e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.03125, + "logps/chosen": -740.5, + "logps/rejected": -737.0, + "loss": 0.4418, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.099609375, + "rewards/margins": 8.703125, + "rewards/rejected": -6.609375, + "step": 3261 + }, + { + "epoch": 0.6472543280916712, + "grad_norm": 32.924842227928124, + "learning_rate": 4.0049221472369367e-07, + "logits/chosen": 3.58203125, + "logits/rejected": 3.765625, + "logps/chosen": -1059.0, + "logps/rejected": -748.0, + "loss": 0.4821, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.462890625, + "rewards/margins": 8.4609375, + "rewards/rejected": -6.015625, + "step": 3262 + }, + { + "epoch": 0.6474527506324719, + "grad_norm": 43.03520326495205, + "learning_rate": 4.0019828883594553e-07, + "logits/chosen": 3.6640625, + "logits/rejected": 3.5, + "logps/chosen": -1332.0, + "logps/rejected": -685.0, + "loss": 0.4152, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.025390625, + "rewards/margins": 7.8828125, + "rewards/rejected": -5.85546875, + "step": 3263 + }, + { + "epoch": 0.6476511731732725, + "grad_norm": 40.853084436289, + "learning_rate": 3.9990443480541744e-07, + 
"logits/chosen": 3.5, + "logits/rejected": 3.27734375, + "logps/chosen": -847.0, + "logps/rejected": -697.0, + "loss": 0.3968, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.724609375, + "rewards/margins": 7.88671875, + "rewards/rejected": -6.16015625, + "step": 3264 + }, + { + "epoch": 0.6478495957140731, + "grad_norm": 23.400447503682408, + "learning_rate": 3.996106527730656e-07, + "logits/chosen": 4.34765625, + "logits/rejected": 4.25, + "logps/chosen": -1320.0, + "logps/rejected": -1091.0, + "loss": 0.3807, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.244140625, + "rewards/margins": 10.0625, + "rewards/rejected": -7.8125, + "step": 3265 + }, + { + "epoch": 0.6480480182548738, + "grad_norm": 51.43978621696064, + "learning_rate": 3.993169428798123e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 3.66796875, + "logps/chosen": -1004.0, + "logps/rejected": -912.0, + "loss": 0.3266, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.1455078125, + "rewards/margins": 9.75, + "rewards/rejected": -7.59375, + "step": 3266 + }, + { + "epoch": 0.6482464407956744, + "grad_norm": 33.029831034792174, + "learning_rate": 3.990233052665449e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.140625, + "logps/chosen": -1028.5, + "logps/rejected": -889.0, + "loss": 0.432, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.599609375, + "rewards/margins": 8.7265625, + "rewards/rejected": -7.140625, + "step": 3267 + }, + { + "epoch": 0.648444863336475, + "grad_norm": 30.684960795241867, + "learning_rate": 3.987297400741162e-07, + "logits/chosen": 4.5234375, + "logits/rejected": 4.28125, + "logps/chosen": -652.5, + "logps/rejected": -536.5, + "loss": 0.4442, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.572265625, + "rewards/margins": 6.9296875, + "rewards/rejected": -5.34375, + "step": 3268 + }, + { + "epoch": 0.6486432858772757, + "grad_norm": 29.48505108953606, + "learning_rate": 3.9843624744334404e-07, + "logits/chosen": 4.1640625, + 
"logits/rejected": 3.984375, + "logps/chosen": -1009.0, + "logps/rejected": -653.0, + "loss": 0.3432, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.341796875, + "rewards/margins": 7.859375, + "rewards/rejected": -5.51171875, + "step": 3269 + }, + { + "epoch": 0.6488417084180763, + "grad_norm": 33.54282589884898, + "learning_rate": 3.9814282751501173e-07, + "logits/chosen": 3.73828125, + "logits/rejected": 3.81640625, + "logps/chosen": -1148.0, + "logps/rejected": -708.0, + "loss": 0.4233, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.0927734375, + "rewards/margins": 7.94921875, + "rewards/rejected": -5.859375, + "step": 3270 + }, + { + "epoch": 0.6490401309588769, + "grad_norm": 29.929683566571875, + "learning_rate": 3.978494804298675e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 3.8359375, + "logps/chosen": -908.0, + "logps/rejected": -652.5, + "loss": 0.4739, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.884765625, + "rewards/margins": 7.8515625, + "rewards/rejected": -5.9765625, + "step": 3271 + }, + { + "epoch": 0.6492385534996775, + "grad_norm": 33.27644123493782, + "learning_rate": 3.975562063286247e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 3.91015625, + "logps/chosen": -901.0, + "logps/rejected": -996.5, + "loss": 0.3293, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.609375, + "rewards/margins": 8.65625, + "rewards/rejected": -6.05078125, + "step": 3272 + }, + { + "epoch": 0.6494369760404782, + "grad_norm": 41.206920112725314, + "learning_rate": 3.9726300535196225e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.703125, + "logps/chosen": -876.0, + "logps/rejected": -745.0, + "loss": 0.523, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.41064453125, + "rewards/margins": 5.1328125, + "rewards/rejected": -3.7109375, + "step": 3273 + }, + { + "epoch": 0.6496353985812788, + "grad_norm": 31.842461991528854, + "learning_rate": 3.969698776405227e-07, + "logits/chosen": 3.41015625, + 
"logits/rejected": 3.34765625, + "logps/chosen": -913.0, + "logps/rejected": -623.0, + "loss": 0.4561, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.142578125, + "rewards/margins": 5.96875, + "rewards/rejected": -3.82421875, + "step": 3274 + }, + { + "epoch": 0.6498338211220794, + "grad_norm": 36.487288653128566, + "learning_rate": 3.966768233349148e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.03125, + "logps/chosen": -730.0, + "logps/rejected": -837.5, + "loss": 0.4211, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.916015625, + "rewards/margins": 7.140625, + "rewards/rejected": -5.216796875, + "step": 3275 + }, + { + "epoch": 0.6500322436628801, + "grad_norm": 44.0617184309784, + "learning_rate": 3.963838425757109e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 3.91796875, + "logps/chosen": -924.0, + "logps/rejected": -617.5, + "loss": 0.3771, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.09375, + "rewards/margins": 7.1875, + "rewards/rejected": -5.109375, + "step": 3276 + }, + { + "epoch": 0.6502306662036808, + "grad_norm": 30.233026890566, + "learning_rate": 3.9609093550344907e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.11328125, + "logps/chosen": -908.0, + "logps/rejected": -843.0, + "loss": 0.5078, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.214599609375, + "rewards/margins": 7.15625, + "rewards/rejected": -5.9453125, + "step": 3277 + }, + { + "epoch": 0.6504290887444814, + "grad_norm": 38.8104117862501, + "learning_rate": 3.9579810225863164e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.06640625, + "logps/chosen": -1195.0, + "logps/rejected": -1196.0, + "loss": 0.3148, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.94140625, + "rewards/margins": 9.4453125, + "rewards/rejected": -6.4921875, + "step": 3278 + }, + { + "epoch": 0.650627511285282, + "grad_norm": 30.62441210546308, + "learning_rate": 3.9550534298172534e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 
3.8046875, + "logps/chosen": -1271.0, + "logps/rejected": -703.0, + "loss": 0.5117, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.79296875, + "rewards/margins": 6.267578125, + "rewards/rejected": -3.46435546875, + "step": 3279 + }, + { + "epoch": 0.6508259338260827, + "grad_norm": 35.26460308421036, + "learning_rate": 3.952126578131619e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.10546875, + "logps/chosen": -887.0, + "logps/rejected": -681.0, + "loss": 0.4145, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.83203125, + "rewards/margins": 7.5234375, + "rewards/rejected": -5.6796875, + "step": 3280 + }, + { + "epoch": 0.6510243563668833, + "grad_norm": 34.430274551698524, + "learning_rate": 3.949200468933367e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.45703125, + "logps/chosen": -1025.0, + "logps/rejected": -664.0, + "loss": 0.3505, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.375, + "rewards/margins": 7.59375, + "rewards/rejected": -5.23046875, + "step": 3281 + }, + { + "epoch": 0.6512227789076839, + "grad_norm": 28.255651729908294, + "learning_rate": 3.946275103626103e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.28125, + "logps/chosen": -929.0, + "logps/rejected": -594.0, + "loss": 0.4234, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3564453125, + "rewards/margins": 7.953125, + "rewards/rejected": -5.6015625, + "step": 3282 + }, + { + "epoch": 0.6514212014484846, + "grad_norm": 35.690056717232444, + "learning_rate": 3.9433504836130736e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.0546875, + "logps/chosen": -1178.0, + "logps/rejected": -1247.0, + "loss": 0.2824, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2548828125, + "rewards/margins": 9.9453125, + "rewards/rejected": -7.671875, + "step": 3283 + }, + { + "epoch": 0.6516196239892852, + "grad_norm": 35.035666516511576, + "learning_rate": 3.9404266102971674e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.296875, + 
"logps/chosen": -946.0, + "logps/rejected": -547.0, + "loss": 0.4276, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3515625, + "rewards/margins": 8.1015625, + "rewards/rejected": -5.7578125, + "step": 3284 + }, + { + "epoch": 0.6518180465300858, + "grad_norm": 24.045843800342713, + "learning_rate": 3.9375034850809105e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.1015625, + "logps/chosen": -1254.0, + "logps/rejected": -875.0, + "loss": 0.3484, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.5625, + "rewards/margins": 10.1484375, + "rewards/rejected": -6.5703125, + "step": 3285 + }, + { + "epoch": 0.6520164690708865, + "grad_norm": 25.87811796623815, + "learning_rate": 3.934581109366477e-07, + "logits/chosen": 4.4765625, + "logits/rejected": 4.2421875, + "logps/chosen": -1074.5, + "logps/rejected": -631.5, + "loss": 0.3368, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.296875, + "rewards/margins": 8.9609375, + "rewards/rejected": -5.671875, + "step": 3286 + }, + { + "epoch": 0.6522148916116871, + "grad_norm": 33.57295941364868, + "learning_rate": 3.9316594845556803e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 3.84375, + "logps/chosen": -1254.0, + "logps/rejected": -900.0, + "loss": 0.357, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.2734375, + "rewards/margins": 9.8828125, + "rewards/rejected": -7.59375, + "step": 3287 + }, + { + "epoch": 0.6524133141524877, + "grad_norm": 24.996214559161796, + "learning_rate": 3.9287386120499677e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 3.8203125, + "logps/chosen": -1394.0, + "logps/rejected": -842.0, + "loss": 0.338, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.4296875, + "rewards/margins": 9.53125, + "rewards/rejected": -6.09375, + "step": 3288 + }, + { + "epoch": 0.6526117366932883, + "grad_norm": 34.95638879102638, + "learning_rate": 3.9258184932504346e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 3.9140625, + "logps/chosen": -989.0, + 
"logps/rejected": -716.0, + "loss": 0.3576, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.86328125, + "rewards/margins": 7.59375, + "rewards/rejected": -4.74609375, + "step": 3289 + }, + { + "epoch": 0.652810159234089, + "grad_norm": 37.51647466178745, + "learning_rate": 3.922899129557805e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 3.92578125, + "logps/chosen": -2344.0, + "logps/rejected": -700.0, + "loss": 0.3315, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.44140625, + "rewards/margins": 6.1640625, + "rewards/rejected": -6.59375, + "step": 3290 + }, + { + "epoch": 0.6530085817748896, + "grad_norm": 36.2301526896956, + "learning_rate": 3.9199805223724513e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.30078125, + "logps/chosen": -921.0, + "logps/rejected": -921.5, + "loss": 0.4848, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3046875, + "rewards/margins": 7.28125, + "rewards/rejected": -4.98046875, + "step": 3291 + }, + { + "epoch": 0.6532070043156902, + "grad_norm": 39.78398163696166, + "learning_rate": 3.9170626730943725e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 3.58203125, + "logps/chosen": -822.0, + "logps/rejected": -1039.5, + "loss": 0.2507, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.40625, + "rewards/margins": 9.1640625, + "rewards/rejected": -6.74609375, + "step": 3292 + }, + { + "epoch": 0.653405426856491, + "grad_norm": 27.945907431148054, + "learning_rate": 3.9141455831232106e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 4.3515625, + "logps/chosen": -1014.0, + "logps/rejected": -737.0, + "loss": 0.3341, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.03515625, + "rewards/margins": 9.5, + "rewards/rejected": -6.46875, + "step": 3293 + }, + { + "epoch": 0.6536038493972915, + "grad_norm": 29.012543456376406, + "learning_rate": 3.911229253858246e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.1484375, + "logps/chosen": -1347.0, + "logps/rejected": -1322.0, + 
"loss": 0.3896, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.28515625, + "rewards/margins": 10.203125, + "rewards/rejected": -7.9296875, + "step": 3294 + }, + { + "epoch": 0.6538022719380921, + "grad_norm": 26.138138128072242, + "learning_rate": 3.9083136866983837e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.34375, + "logps/chosen": -1162.0, + "logps/rejected": -686.0, + "loss": 0.4457, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.37890625, + "rewards/margins": 7.53125, + "rewards/rejected": -5.1484375, + "step": 3295 + }, + { + "epoch": 0.6540006944788928, + "grad_norm": 30.228225917454225, + "learning_rate": 3.9053988830421735e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.50390625, + "logps/chosen": -1061.0, + "logps/rejected": -2825.0, + "loss": 0.5116, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.83203125, + "rewards/margins": 14.59375, + "rewards/rejected": -12.78125, + "step": 3296 + }, + { + "epoch": 0.6541991170196935, + "grad_norm": 29.161689875277844, + "learning_rate": 3.9024848442877935e-07, + "logits/chosen": 4.50390625, + "logits/rejected": 4.28515625, + "logps/chosen": -882.0, + "logps/rejected": -609.0, + "loss": 0.3607, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.599609375, + "rewards/margins": 7.296875, + "rewards/rejected": -4.6953125, + "step": 3297 + }, + { + "epoch": 0.6543975395604941, + "grad_norm": 32.182958110951645, + "learning_rate": 3.899571571833057e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.3125, + "logps/chosen": -727.5, + "logps/rejected": -764.0, + "loss": 0.3807, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.537109375, + "rewards/margins": 8.7421875, + "rewards/rejected": -6.21875, + "step": 3298 + }, + { + "epoch": 0.6545959621012947, + "grad_norm": 35.40534066861923, + "learning_rate": 3.8966590670754075e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.17578125, + "logps/chosen": -944.0, + "logps/rejected": -794.5, + "loss": 0.4143, + 
"rewards/accuracies": 0.90625, + "rewards/chosen": 1.98046875, + "rewards/margins": 6.5859375, + "rewards/rejected": -4.5859375, + "step": 3299 + }, + { + "epoch": 0.6547943846420954, + "grad_norm": 34.2426237504604, + "learning_rate": 3.893747331411924e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 4.06640625, + "logps/chosen": -940.0, + "logps/rejected": -765.0, + "loss": 0.3941, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.50390625, + "rewards/margins": 8.8984375, + "rewards/rejected": -6.3984375, + "step": 3300 + }, + { + "epoch": 0.654992807182896, + "grad_norm": 28.623031047675738, + "learning_rate": 3.890836366239312e-07, + "logits/chosen": 4.015625, + "logits/rejected": 3.87890625, + "logps/chosen": -1087.0, + "logps/rejected": -750.0, + "loss": 0.2454, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.576171875, + "rewards/margins": 9.90625, + "rewards/rejected": -6.328125, + "step": 3301 + }, + { + "epoch": 0.6551912297236966, + "grad_norm": 38.98923238701047, + "learning_rate": 3.8879261729539114e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.6796875, + "logps/chosen": -998.0, + "logps/rejected": -619.0, + "loss": 0.4833, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.52734375, + "rewards/margins": 5.1015625, + "rewards/rejected": -4.57421875, + "step": 3302 + }, + { + "epoch": 0.6553896522644973, + "grad_norm": 35.07306209920663, + "learning_rate": 3.8850167529516907e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.0859375, + "logps/chosen": -719.0, + "logps/rejected": -644.5, + "loss": 0.3618, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.56640625, + "rewards/margins": 7.390625, + "rewards/rejected": -4.83203125, + "step": 3303 + }, + { + "epoch": 0.6555880748052979, + "grad_norm": 56.77956152550229, + "learning_rate": 3.8821081076282456e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.9609375, + "logps/chosen": -1014.0, + "logps/rejected": -649.0, + "loss": 0.4747, + "rewards/accuracies": 
0.875, + "rewards/chosen": 2.3984375, + "rewards/margins": 6.1484375, + "rewards/rejected": -3.74609375, + "step": 3304 + }, + { + "epoch": 0.6557864973460985, + "grad_norm": 28.151349944166697, + "learning_rate": 3.8792002383788036e-07, + "logits/chosen": 3.4609375, + "logits/rejected": 3.61328125, + "logps/chosen": -891.0, + "logps/rejected": -846.5, + "loss": 0.4443, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.04443359375, + "rewards/margins": 17.7578125, + "rewards/rejected": -15.6953125, + "step": 3305 + }, + { + "epoch": 0.6559849198868991, + "grad_norm": 33.64116238654369, + "learning_rate": 3.876293146598215e-07, + "logits/chosen": 3.6171875, + "logits/rejected": 3.68359375, + "logps/chosen": -1111.0, + "logps/rejected": -823.0, + "loss": 0.4182, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.47265625, + "rewards/margins": 15.78125, + "rewards/rejected": -13.34765625, + "step": 3306 + }, + { + "epoch": 0.6561833424276998, + "grad_norm": 27.34267130564773, + "learning_rate": 3.873386833680963e-07, + "logits/chosen": 4.28515625, + "logits/rejected": 4.10546875, + "logps/chosen": -1276.0, + "logps/rejected": -917.0, + "loss": 0.355, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.91796875, + "rewards/margins": 9.671875, + "rewards/rejected": -6.75390625, + "step": 3307 + }, + { + "epoch": 0.6563817649685004, + "grad_norm": 25.78767916348185, + "learning_rate": 3.870481301021151e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.08984375, + "logps/chosen": -859.0, + "logps/rejected": -665.0, + "loss": 0.3299, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.05859375, + "rewards/margins": 8.34765625, + "rewards/rejected": -5.296875, + "step": 3308 + }, + { + "epoch": 0.656580187509301, + "grad_norm": 24.91792917132117, + "learning_rate": 3.8675765500125146e-07, + "logits/chosen": 3.96875, + "logits/rejected": 3.7890625, + "logps/chosen": -1302.0, + "logps/rejected": -815.0, + "loss": 0.2892, + "rewards/accuracies": 0.90625, + 
"rewards/chosen": 3.84375, + "rewards/margins": 10.0546875, + "rewards/rejected": -6.2109375, + "step": 3309 + }, + { + "epoch": 0.6567786100501017, + "grad_norm": 24.434159922976193, + "learning_rate": 3.8646725820484105e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.01953125, + "logps/chosen": -1085.0, + "logps/rejected": -624.5, + "loss": 0.1794, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.984375, + "rewards/margins": 9.71875, + "rewards/rejected": -6.734375, + "step": 3310 + }, + { + "epoch": 0.6569770325909023, + "grad_norm": 34.60317175050206, + "learning_rate": 3.861769398521819e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.08203125, + "logps/chosen": -919.0, + "logps/rejected": -836.5, + "loss": 0.357, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.333984375, + "rewards/margins": 17.296875, + "rewards/rejected": -15.015625, + "step": 3311 + }, + { + "epoch": 0.657175455131703, + "grad_norm": 34.772132305121495, + "learning_rate": 3.8588670008253464e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 3.67578125, + "logps/chosen": -1080.0, + "logps/rejected": -660.0, + "loss": 0.3458, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.234375, + "rewards/margins": 7.484375, + "rewards/rejected": -5.23828125, + "step": 3312 + }, + { + "epoch": 0.6573738776725035, + "grad_norm": 35.25452126217319, + "learning_rate": 3.855965390351222e-07, + "logits/chosen": 4.609375, + "logits/rejected": 4.8671875, + "logps/chosen": -1029.0, + "logps/rejected": -2497.0, + "loss": 0.5325, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.7763671875, + "rewards/margins": 10.38671875, + "rewards/rejected": -8.626953125, + "step": 3313 + }, + { + "epoch": 0.6575723002133043, + "grad_norm": 38.655025984289814, + "learning_rate": 3.853064568491298e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.21484375, + "logps/chosen": -961.0, + "logps/rejected": -941.0, + "loss": 0.3497, + "rewards/accuracies": 0.875, + 
"rewards/chosen": 2.6328125, + "rewards/margins": 10.0546875, + "rewards/rejected": -7.421875, + "step": 3314 + }, + { + "epoch": 0.6577707227541049, + "grad_norm": 21.970058836872525, + "learning_rate": 3.850164536637044e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.421875, + "logps/chosen": -959.0, + "logps/rejected": -857.0, + "loss": 0.3433, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6640625, + "rewards/margins": 8.6015625, + "rewards/rejected": -5.9375, + "step": 3315 + }, + { + "epoch": 0.6579691452949055, + "grad_norm": 29.909194699682818, + "learning_rate": 3.8472652961795547e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 3.96484375, + "logps/chosen": -942.0, + "logps/rejected": -732.0, + "loss": 0.274, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.533203125, + "rewards/margins": 8.84375, + "rewards/rejected": -6.3203125, + "step": 3316 + }, + { + "epoch": 0.6581675678357062, + "grad_norm": 36.28507203606784, + "learning_rate": 3.844366848509545e-07, + "logits/chosen": 4.515625, + "logits/rejected": 4.5703125, + "logps/chosen": -1160.0, + "logps/rejected": -1028.0, + "loss": 0.4001, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9169921875, + "rewards/margins": 9.796875, + "rewards/rejected": -7.86328125, + "step": 3317 + }, + { + "epoch": 0.6583659903765068, + "grad_norm": 28.65047211376469, + "learning_rate": 3.8414691950173483e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 3.80859375, + "logps/chosen": -1070.0, + "logps/rejected": -629.0, + "loss": 0.3657, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.650390625, + "rewards/margins": 7.3125, + "rewards/rejected": -4.66064453125, + "step": 3318 + }, + { + "epoch": 0.6585644129173074, + "grad_norm": 30.046682890980048, + "learning_rate": 3.83857233709292e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.171875, + "logps/chosen": -812.0, + "logps/rejected": -875.0, + "loss": 0.3574, + "rewards/accuracies": 0.9375, + "rewards/chosen": 
2.6015625, + "rewards/margins": 8.1875, + "rewards/rejected": -5.5859375, + "step": 3319 + }, + { + "epoch": 0.658762835458108, + "grad_norm": 29.778637762812764, + "learning_rate": 3.8356762761258276e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.23828125, + "logps/chosen": -1005.0, + "logps/rejected": -629.0, + "loss": 0.3662, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.318359375, + "rewards/margins": 7.1015625, + "rewards/rejected": -4.78125, + "step": 3320 + }, + { + "epoch": 0.6589612579989087, + "grad_norm": 26.955785183771127, + "learning_rate": 3.832781013505264e-07, + "logits/chosen": 4.046875, + "logits/rejected": 3.953125, + "logps/chosen": -1286.0, + "logps/rejected": -775.0, + "loss": 0.2315, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.64453125, + "rewards/margins": 9.984375, + "rewards/rejected": -6.3515625, + "step": 3321 + }, + { + "epoch": 0.6591596805397093, + "grad_norm": 35.693138370999044, + "learning_rate": 3.829886550620029e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.30859375, + "logps/chosen": -772.0, + "logps/rejected": -645.5, + "loss": 0.388, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.861328125, + "rewards/margins": 6.7578125, + "rewards/rejected": -4.90234375, + "step": 3322 + }, + { + "epoch": 0.6593581030805099, + "grad_norm": 31.206251894319934, + "learning_rate": 3.8269928888585524e-07, + "logits/chosen": 3.96875, + "logits/rejected": 3.99609375, + "logps/chosen": -1102.0, + "logps/rejected": -1737.0, + "loss": 0.4556, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.166015625, + "rewards/margins": 8.8984375, + "rewards/rejected": -6.70703125, + "step": 3323 + }, + { + "epoch": 0.6595565256213106, + "grad_norm": 36.69161589459943, + "learning_rate": 3.8241000296088667e-07, + "logits/chosen": 4.546875, + "logits/rejected": 4.546875, + "logps/chosen": -781.5, + "logps/rejected": -1487.0, + "loss": 0.5695, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.6943359375, + 
"rewards/margins": 8.015625, + "rewards/rejected": -6.32421875, + "step": 3324 + }, + { + "epoch": 0.6597549481621112, + "grad_norm": 37.341674921168526, + "learning_rate": 3.8212079742586266e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.15234375, + "logps/chosen": -1184.0, + "logps/rejected": -744.0, + "loss": 0.2901, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.34375, + "rewards/margins": 8.0859375, + "rewards/rejected": -4.75, + "step": 3325 + }, + { + "epoch": 0.6599533707029118, + "grad_norm": 35.64510157068921, + "learning_rate": 3.8183167241951043e-07, + "logits/chosen": 4.0, + "logits/rejected": 3.8515625, + "logps/chosen": -565.5, + "logps/rejected": -519.5, + "loss": 0.6161, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.560546875, + "rewards/margins": 4.56640625, + "rewards/rejected": -2.994140625, + "step": 3326 + }, + { + "epoch": 0.6601517932437125, + "grad_norm": 34.75053352717718, + "learning_rate": 3.815426280805173e-07, + "logits/chosen": 3.828125, + "logits/rejected": 3.61328125, + "logps/chosen": -1104.0, + "logps/rejected": -732.0, + "loss": 0.3577, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.796875, + "rewards/margins": 7.6171875, + "rewards/rejected": -4.8203125, + "step": 3327 + }, + { + "epoch": 0.6603502157845131, + "grad_norm": 31.583031020720565, + "learning_rate": 3.812536645475334e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.3203125, + "logps/chosen": -1250.0, + "logps/rejected": -942.0, + "loss": 0.4356, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.89453125, + "rewards/margins": 7.5234375, + "rewards/rejected": -4.6328125, + "step": 3328 + }, + { + "epoch": 0.6605486383253137, + "grad_norm": 34.068287847741864, + "learning_rate": 3.809647819591689e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.28125, + "logps/chosen": -986.0, + "logps/rejected": -641.5, + "loss": 0.506, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.568359375, + "rewards/margins": 5.314453125, 
+ "rewards/rejected": -3.75634765625, + "step": 3329 + }, + { + "epoch": 0.6607470608661143, + "grad_norm": 29.233965298109602, + "learning_rate": 3.806759804539962e-07, + "logits/chosen": 4.6640625, + "logits/rejected": 4.625, + "logps/chosen": -1762.0, + "logps/rejected": -1657.0, + "loss": 0.3713, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.46484375, + "rewards/margins": 11.21875, + "rewards/rejected": -7.7421875, + "step": 3330 + }, + { + "epoch": 0.6609454834069151, + "grad_norm": 23.900171952919905, + "learning_rate": 3.803872601705477e-07, + "logits/chosen": 4.53125, + "logits/rejected": 4.3828125, + "logps/chosen": -1157.0, + "logps/rejected": -807.0, + "loss": 0.2945, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.03515625, + "rewards/margins": 8.3984375, + "rewards/rejected": -5.37890625, + "step": 3331 + }, + { + "epoch": 0.6611439059477157, + "grad_norm": 26.46431126062608, + "learning_rate": 3.8009862124731763e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 3.96875, + "logps/chosen": -989.0, + "logps/rejected": -718.5, + "loss": 0.3428, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.380859375, + "rewards/margins": 9.0078125, + "rewards/rejected": -6.6328125, + "step": 3332 + }, + { + "epoch": 0.6613423284885163, + "grad_norm": 27.497877910808946, + "learning_rate": 3.7981006382276093e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 4.06640625, + "logps/chosen": -1268.0, + "logps/rejected": -1747.0, + "loss": 0.3297, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8984375, + "rewards/margins": 11.7109375, + "rewards/rejected": -8.8203125, + "step": 3333 + }, + { + "epoch": 0.661540751029317, + "grad_norm": 28.145789645159567, + "learning_rate": 3.795215880352934e-07, + "logits/chosen": 4.0625, + "logits/rejected": 3.98046875, + "logps/chosen": -1336.0, + "logps/rejected": -799.0, + "loss": 0.3995, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7156982421875, + "rewards/margins": 9.1484375, + 
"rewards/rejected": -6.4296875, + "step": 3334 + }, + { + "epoch": 0.6617391735701176, + "grad_norm": 31.6407061439618, + "learning_rate": 3.792331940232921e-07, + "logits/chosen": 4.4453125, + "logits/rejected": 4.640625, + "logps/chosen": -1136.0, + "logps/rejected": -1092.0, + "loss": 0.4667, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.080078125, + "rewards/margins": 11.8125, + "rewards/rejected": -8.75, + "step": 3335 + }, + { + "epoch": 0.6619375961109182, + "grad_norm": 28.46816237333761, + "learning_rate": 3.7894488192509397e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.2265625, + "logps/chosen": -1001.5, + "logps/rejected": -727.0, + "loss": 0.4326, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.697998046875, + "rewards/margins": 6.25390625, + "rewards/rejected": -4.55078125, + "step": 3336 + }, + { + "epoch": 0.6621360186517188, + "grad_norm": 41.944303629786155, + "learning_rate": 3.7865665187899775e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.84375, + "logps/chosen": -764.0, + "logps/rejected": -863.0, + "loss": 0.5238, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.59765625, + "rewards/margins": 6.5234375, + "rewards/rejected": -4.921875, + "step": 3337 + }, + { + "epoch": 0.6623344411925195, + "grad_norm": 47.125174111405705, + "learning_rate": 3.783685040232618e-07, + "logits/chosen": 3.671875, + "logits/rejected": 3.56640625, + "logps/chosen": -1191.0, + "logps/rejected": -865.0, + "loss": 0.3346, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.841796875, + "rewards/margins": 8.7421875, + "rewards/rejected": -5.90625, + "step": 3338 + }, + { + "epoch": 0.6625328637333201, + "grad_norm": 26.152741989483058, + "learning_rate": 3.7808043849610594e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.2578125, + "logps/chosen": -957.0, + "logps/rejected": -709.0, + "loss": 0.4898, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.060546875, + "rewards/margins": 6.53515625, + 
"rewards/rejected": -4.4765625, + "step": 3339 + }, + { + "epoch": 0.6627312862741207, + "grad_norm": 38.16121767034243, + "learning_rate": 3.777924554357096e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.1640625, + "logps/chosen": -1121.0, + "logps/rejected": -1202.5, + "loss": 0.3361, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.541015625, + "rewards/margins": 10.15625, + "rewards/rejected": -7.6328125, + "step": 3340 + }, + { + "epoch": 0.6629297088149214, + "grad_norm": 32.79088195367415, + "learning_rate": 3.775045549802135e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.96484375, + "logps/chosen": -1136.0, + "logps/rejected": -1540.0, + "loss": 0.4606, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.0751953125, + "rewards/margins": 9.359375, + "rewards/rejected": -7.2734375, + "step": 3341 + }, + { + "epoch": 0.663128131355722, + "grad_norm": 24.284589817967376, + "learning_rate": 3.7721673726771853e-07, + "logits/chosen": 4.33203125, + "logits/rejected": 4.17578125, + "logps/chosen": -1249.0, + "logps/rejected": -767.0, + "loss": 0.3318, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9609375, + "rewards/margins": 8.9921875, + "rewards/rejected": -6.0234375, + "step": 3342 + }, + { + "epoch": 0.6633265538965226, + "grad_norm": 33.488883803181864, + "learning_rate": 3.7692900243628524e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.43359375, + "logps/chosen": -616.5, + "logps/rejected": -438.5, + "loss": 0.5052, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.3740234375, + "rewards/margins": 5.1875, + "rewards/rejected": -3.810546875, + "step": 3343 + }, + { + "epoch": 0.6635249764373233, + "grad_norm": 30.52942679282785, + "learning_rate": 3.766413506239353e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.77734375, + "logps/chosen": -931.0, + "logps/rejected": -701.5, + "loss": 0.4581, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.06640625, + "rewards/margins": 7.2890625, + 
"rewards/rejected": -4.2275390625, + "step": 3344 + }, + { + "epoch": 0.6637233989781239, + "grad_norm": 30.32516663131225, + "learning_rate": 3.7635378196865e-07, + "logits/chosen": 3.703125, + "logits/rejected": 3.5625, + "logps/chosen": -1188.0, + "logps/rejected": -797.0, + "loss": 0.4131, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.71484375, + "rewards/margins": 8.1796875, + "rewards/rejected": -5.46875, + "step": 3345 + }, + { + "epoch": 0.6639218215189245, + "grad_norm": 29.820431406201664, + "learning_rate": 3.7606629660837095e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 4.0, + "logps/chosen": -1025.0, + "logps/rejected": -691.5, + "loss": 0.3347, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.29296875, + "rewards/margins": 8.6875, + "rewards/rejected": -5.380859375, + "step": 3346 + }, + { + "epoch": 0.6641202440597251, + "grad_norm": 28.21892889398344, + "learning_rate": 3.757788946809998e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 3.921875, + "logps/chosen": -994.0, + "logps/rejected": -1109.0, + "loss": 0.3762, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.390625, + "rewards/margins": 9.0, + "rewards/rejected": -6.6015625, + "step": 3347 + }, + { + "epoch": 0.6643186666005259, + "grad_norm": 40.02300593434155, + "learning_rate": 3.754915763243981e-07, + "logits/chosen": 3.59375, + "logits/rejected": 3.54296875, + "logps/chosen": -957.0, + "logps/rejected": -646.5, + "loss": 0.48, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.4609375, + "rewards/margins": 6.66796875, + "rewards/rejected": -4.19140625, + "step": 3348 + }, + { + "epoch": 0.6645170891413265, + "grad_norm": 35.50676050073324, + "learning_rate": 3.7520434167638736e-07, + "logits/chosen": 4.25, + "logits/rejected": 3.9921875, + "logps/chosen": -1239.0, + "logps/rejected": -794.0, + "loss": 0.3605, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.16015625, + "rewards/margins": 8.5625, + "rewards/rejected": -6.40625, + "step": 3349 
+ }, + { + "epoch": 0.6647155116821271, + "grad_norm": 28.87630995229595, + "learning_rate": 3.749171908747489e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.9140625, + "logps/chosen": -585.25, + "logps/rejected": -565.5, + "loss": 0.5385, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.998046875, + "rewards/margins": 5.990234375, + "rewards/rejected": -3.991455078125, + "step": 3350 + }, + { + "epoch": 0.6649139342229278, + "grad_norm": 26.038118098804603, + "learning_rate": 3.7463012405722426e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.79296875, + "logps/chosen": -862.5, + "logps/rejected": -556.5, + "loss": 0.43, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.17578125, + "rewards/margins": 6.5625, + "rewards/rejected": -4.37890625, + "step": 3351 + }, + { + "epoch": 0.6651123567637284, + "grad_norm": 30.624420137968617, + "learning_rate": 3.743431413615138e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.1640625, + "logps/chosen": -1178.0, + "logps/rejected": -835.0, + "loss": 0.4291, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.703125, + "rewards/margins": 8.125, + "rewards/rejected": -5.41015625, + "step": 3352 + }, + { + "epoch": 0.665310779304529, + "grad_norm": 27.44377390822138, + "learning_rate": 3.740562429252782e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.171875, + "logps/chosen": -1345.0, + "logps/rejected": -731.0, + "loss": 0.2391, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.85546875, + "rewards/margins": 10.6875, + "rewards/rejected": -6.82421875, + "step": 3353 + }, + { + "epoch": 0.6655092018453296, + "grad_norm": 35.62655503833288, + "learning_rate": 3.737694288861375e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.484375, + "logps/chosen": -885.0, + "logps/rejected": -619.5, + "loss": 0.4491, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6328125, + "rewards/margins": 7.2890625, + "rewards/rejected": -4.67578125, + "step": 3354 + }, + { + "epoch": 
0.6657076243861303, + "grad_norm": 29.402334659717518, + "learning_rate": 3.7348269938167133e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 3.98828125, + "logps/chosen": -1231.0, + "logps/rejected": -674.5, + "loss": 0.3698, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.400390625, + "rewards/margins": 9.6328125, + "rewards/rejected": -6.23046875, + "step": 3355 + }, + { + "epoch": 0.6659060469269309, + "grad_norm": 31.466899415807717, + "learning_rate": 3.731960545494187e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 3.62109375, + "logps/chosen": -1211.5, + "logps/rejected": -684.5, + "loss": 0.3599, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.748046875, + "rewards/margins": 8.7109375, + "rewards/rejected": -5.9453125, + "step": 3356 + }, + { + "epoch": 0.6661044694677315, + "grad_norm": 26.04561521150575, + "learning_rate": 3.7290949452687804e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.3828125, + "logps/chosen": -1210.0, + "logps/rejected": -1107.0, + "loss": 0.4268, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.24609375, + "rewards/margins": 8.1796875, + "rewards/rejected": -4.93359375, + "step": 3357 + }, + { + "epoch": 0.6663028920085322, + "grad_norm": 25.99809298085888, + "learning_rate": 3.726230194515073e-07, + "logits/chosen": 3.84375, + "logits/rejected": 3.62109375, + "logps/chosen": -950.0, + "logps/rejected": -699.5, + "loss": 0.5039, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.1796875, + "rewards/margins": 7.6484375, + "rewards/rejected": -5.47265625, + "step": 3358 + }, + { + "epoch": 0.6665013145493328, + "grad_norm": 27.188563440888437, + "learning_rate": 3.723366294607232e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.77734375, + "logps/chosen": -1176.0, + "logps/rejected": -877.0, + "loss": 0.1969, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.41015625, + "rewards/margins": 9.6953125, + "rewards/rejected": -6.28125, + "step": 3359 + }, + { + "epoch": 
0.6666997370901334, + "grad_norm": 33.7168295864771, + "learning_rate": 3.720503246919023e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.4140625, + "logps/chosen": -955.0, + "logps/rejected": -729.0, + "loss": 0.5063, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.1328125, + "rewards/margins": 7.859375, + "rewards/rejected": -5.7265625, + "step": 3360 + }, + { + "epoch": 0.6668981596309341, + "grad_norm": 43.32980575932226, + "learning_rate": 3.7176410528237945e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 3.6953125, + "logps/chosen": -1266.0, + "logps/rejected": -860.0, + "loss": 0.3197, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7421875, + "rewards/margins": 10.0078125, + "rewards/rejected": -7.296875, + "step": 3361 + }, + { + "epoch": 0.6670965821717347, + "grad_norm": 28.324652371518155, + "learning_rate": 3.7147797136944934e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.69140625, + "logps/chosen": -925.5, + "logps/rejected": -711.0, + "loss": 0.3674, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.390625, + "rewards/margins": 7.8359375, + "rewards/rejected": -4.4453125, + "step": 3362 + }, + { + "epoch": 0.6672950047125353, + "grad_norm": 28.730111351601426, + "learning_rate": 3.711919230903652e-07, + "logits/chosen": 4.0, + "logits/rejected": 3.91796875, + "logps/chosen": -825.0, + "logps/rejected": -588.5, + "loss": 0.3703, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.724609375, + "rewards/margins": 7.9921875, + "rewards/rejected": -5.28125, + "step": 3363 + }, + { + "epoch": 0.6674934272533359, + "grad_norm": 30.50449639607032, + "learning_rate": 3.7090596058233927e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 4.1875, + "logps/chosen": -576.0, + "logps/rejected": -690.0, + "loss": 0.3947, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.267578125, + "rewards/margins": 6.78125, + "rewards/rejected": -4.51953125, + "step": 3364 + }, + { + "epoch": 0.6676918497941366, + 
"grad_norm": 31.93329399264854, + "learning_rate": 3.7062008398254287e-07, + "logits/chosen": 4.51953125, + "logits/rejected": 4.43359375, + "logps/chosen": -1265.0, + "logps/rejected": -973.0, + "loss": 0.4135, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5390625, + "rewards/margins": 8.703125, + "rewards/rejected": -6.171875, + "step": 3365 + }, + { + "epoch": 0.6678902723349373, + "grad_norm": 26.260004639086663, + "learning_rate": 3.703342934281056e-07, + "logits/chosen": 3.51953125, + "logits/rejected": 3.578125, + "logps/chosen": -1095.0, + "logps/rejected": -850.0, + "loss": 0.3832, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.20703125, + "rewards/margins": 8.67578125, + "rewards/rejected": -6.453125, + "step": 3366 + }, + { + "epoch": 0.6680886948757379, + "grad_norm": 39.27438714134266, + "learning_rate": 3.700485890561167e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 3.87890625, + "logps/chosen": -1114.0, + "logps/rejected": -1549.5, + "loss": 0.3776, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1142578125, + "rewards/margins": 11.078125, + "rewards/rejected": -8.9921875, + "step": 3367 + }, + { + "epoch": 0.6682871174165386, + "grad_norm": 32.141382493372404, + "learning_rate": 3.6976297100362273e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.84375, + "logps/chosen": -593.0, + "logps/rejected": -541.0, + "loss": 0.5289, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.640625, + "rewards/margins": 5.6171875, + "rewards/rejected": -3.97265625, + "step": 3368 + }, + { + "epoch": 0.6684855399573392, + "grad_norm": 30.554304595646926, + "learning_rate": 3.694774394076302e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 3.94921875, + "logps/chosen": -1056.0, + "logps/rejected": -764.0, + "loss": 0.3757, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6328125, + "rewards/margins": 8.63671875, + "rewards/rejected": -6.0078125, + "step": 3369 + }, + { + "epoch": 0.6686839624981398, + "grad_norm": 
26.784336835773214, + "learning_rate": 3.6919199440510316e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 3.76953125, + "logps/chosen": -1038.0, + "logps/rejected": -707.0, + "loss": 0.4248, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.5859375, + "rewards/margins": 8.796875, + "rewards/rejected": -5.203125, + "step": 3370 + }, + { + "epoch": 0.6688823850389404, + "grad_norm": 39.24058729672231, + "learning_rate": 3.689066361329649e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.73046875, + "logps/chosen": -839.0, + "logps/rejected": -1133.5, + "loss": 0.4761, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.673828125, + "rewards/margins": 7.96875, + "rewards/rejected": -6.2890625, + "step": 3371 + }, + { + "epoch": 0.6690808075797411, + "grad_norm": 28.804095218629584, + "learning_rate": 3.6862136472809623e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.78515625, + "logps/chosen": -1067.0, + "logps/rejected": -710.0, + "loss": 0.2857, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.84375, + "rewards/margins": 8.3203125, + "rewards/rejected": -5.46875, + "step": 3372 + }, + { + "epoch": 0.6692792301205417, + "grad_norm": 26.910289250887974, + "learning_rate": 3.6833618032733704e-07, + "logits/chosen": 3.73046875, + "logits/rejected": 3.890625, + "logps/chosen": -1039.0, + "logps/rejected": -664.0, + "loss": 0.2401, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.49609375, + "rewards/margins": 10.21875, + "rewards/rejected": -6.71484375, + "step": 3373 + }, + { + "epoch": 0.6694776526613423, + "grad_norm": 37.259263135379896, + "learning_rate": 3.680510830674852e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 4.07421875, + "logps/chosen": -1205.0, + "logps/rejected": -1545.0, + "loss": 0.3208, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.826171875, + "rewards/margins": 12.484375, + "rewards/rejected": -9.6484375, + "step": 3374 + }, + { + "epoch": 0.669676075202143, + "grad_norm": 
34.58899461113606, + "learning_rate": 3.677660730852968e-07, + "logits/chosen": 3.7421875, + "logits/rejected": 3.7109375, + "logps/chosen": -606.0, + "logps/rejected": -577.5, + "loss": 0.6265, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.9716796875, + "rewards/margins": 4.25, + "rewards/rejected": -3.279296875, + "step": 3375 + }, + { + "epoch": 0.6698744977429436, + "grad_norm": 27.9970763369221, + "learning_rate": 3.674811505174863e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.8984375, + "logps/chosen": -1700.0, + "logps/rejected": -850.0, + "loss": 0.3193, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.416015625, + "rewards/margins": 9.203125, + "rewards/rejected": -7.78125, + "step": 3376 + }, + { + "epoch": 0.6700729202837442, + "grad_norm": 26.657002600880723, + "learning_rate": 3.671963155007255e-07, + "logits/chosen": 3.73046875, + "logits/rejected": 3.76953125, + "logps/chosen": -946.0, + "logps/rejected": -1083.0, + "loss": 0.2727, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2109375, + "rewards/margins": 10.71875, + "rewards/rejected": -7.5234375, + "step": 3377 + }, + { + "epoch": 0.6702713428245448, + "grad_norm": 28.654080351656003, + "learning_rate": 3.6691156817164516e-07, + "logits/chosen": 3.859375, + "logits/rejected": 4.0625, + "logps/chosen": -972.0, + "logps/rejected": -725.0, + "loss": 0.3248, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.609375, + "rewards/margins": 8.0234375, + "rewards/rejected": -5.421875, + "step": 3378 + }, + { + "epoch": 0.6704697653653455, + "grad_norm": 37.73247666301195, + "learning_rate": 3.666269086668331e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.17578125, + "logps/chosen": -841.0, + "logps/rejected": -612.5, + "loss": 0.5261, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3046875, + "rewards/margins": 6.38671875, + "rewards/rejected": -4.078125, + "step": 3379 + }, + { + "epoch": 0.6706681879061461, + "grad_norm": 26.680662106874017, + 
"learning_rate": 3.663423371228357e-07, + "logits/chosen": 3.90625, + "logits/rejected": 4.34375, + "logps/chosen": -1001.0, + "logps/rejected": -1289.0, + "loss": 0.4392, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.44140625, + "rewards/margins": 10.203125, + "rewards/rejected": -7.75390625, + "step": 3380 + }, + { + "epoch": 0.6708666104469467, + "grad_norm": 27.005296610572987, + "learning_rate": 3.6605785367615705e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 3.97265625, + "logps/chosen": -976.0, + "logps/rejected": -647.5, + "loss": 0.3707, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9140625, + "rewards/margins": 8.765625, + "rewards/rejected": -5.85546875, + "step": 3381 + }, + { + "epoch": 0.6710650329877474, + "grad_norm": 27.6060812235599, + "learning_rate": 3.6577345846325833e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.0859375, + "logps/chosen": -909.0, + "logps/rejected": -747.0, + "loss": 0.5002, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.1376953125, + "rewards/margins": 6.4140625, + "rewards/rejected": -4.27734375, + "step": 3382 + }, + { + "epoch": 0.671263455528548, + "grad_norm": 35.228139126895165, + "learning_rate": 3.654891516205593e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 3.78125, + "logps/chosen": -740.0, + "logps/rejected": -443.0, + "loss": 0.447, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.578125, + "rewards/margins": 5.47265625, + "rewards/rejected": -3.890625, + "step": 3383 + }, + { + "epoch": 0.6714618780693486, + "grad_norm": 33.80627215967977, + "learning_rate": 3.652049332844366e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 3.72265625, + "logps/chosen": -1287.0, + "logps/rejected": -702.5, + "loss": 0.415, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.0732421875, + "rewards/margins": 7.1640625, + "rewards/rejected": -5.10546875, + "step": 3384 + }, + { + "epoch": 0.6716603006101494, + "grad_norm": 32.059400970321256, + "learning_rate": 
3.649208035912249e-07, + "logits/chosen": 3.85546875, + "logits/rejected": 3.671875, + "logps/chosen": -1187.0, + "logps/rejected": -826.0, + "loss": 0.3279, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.671875, + "rewards/margins": 8.2734375, + "rewards/rejected": -5.5703125, + "step": 3385 + }, + { + "epoch": 0.67185872315095, + "grad_norm": 31.762636300542837, + "learning_rate": 3.64636762677216e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 3.99609375, + "logps/chosen": -1152.0, + "logps/rejected": -2224.0, + "loss": 0.4679, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.087890625, + "rewards/margins": 12.125, + "rewards/rejected": -10.01171875, + "step": 3386 + }, + { + "epoch": 0.6720571456917506, + "grad_norm": 32.420765919700706, + "learning_rate": 3.643528106786595e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.8046875, + "logps/chosen": -1016.5, + "logps/rejected": -689.0, + "loss": 0.4219, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.56640625, + "rewards/margins": 7.49609375, + "rewards/rejected": -4.93359375, + "step": 3387 + }, + { + "epoch": 0.6722555682325512, + "grad_norm": 33.864745682111135, + "learning_rate": 3.64068947731762e-07, + "logits/chosen": 4.5078125, + "logits/rejected": 4.4609375, + "logps/chosen": -948.0, + "logps/rejected": -710.0, + "loss": 0.5778, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6572265625, + "rewards/margins": 6.1796875, + "rewards/rejected": -4.53125, + "step": 3388 + }, + { + "epoch": 0.6724539907733519, + "grad_norm": 35.19571583095833, + "learning_rate": 3.637851739726874e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.79296875, + "logps/chosen": -760.0, + "logps/rejected": -796.0, + "loss": 0.3854, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.2578125, + "rewards/margins": 6.5859375, + "rewards/rejected": -4.3359375, + "step": 3389 + }, + { + "epoch": 0.6726524133141525, + "grad_norm": 34.23026694249068, + "learning_rate": 
3.635014895375572e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.6484375, + "logps/chosen": -694.0, + "logps/rejected": -539.0, + "loss": 0.3575, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.32421875, + "rewards/margins": 7.4375, + "rewards/rejected": -5.1171875, + "step": 3390 + }, + { + "epoch": 0.6728508358549531, + "grad_norm": 35.8890140878885, + "learning_rate": 3.6321789456244944e-07, + "logits/chosen": 3.3671875, + "logits/rejected": 3.6796875, + "logps/chosen": -1129.0, + "logps/rejected": -910.0, + "loss": 0.5162, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.75, + "rewards/margins": 7.55859375, + "rewards/rejected": -4.796875, + "step": 3391 + }, + { + "epoch": 0.6730492583957538, + "grad_norm": 32.935483616964575, + "learning_rate": 3.629343891834001e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.1953125, + "logps/chosen": -1046.0, + "logps/rejected": -930.0, + "loss": 0.3133, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.05859375, + "rewards/margins": 9.171875, + "rewards/rejected": -6.1015625, + "step": 3392 + }, + { + "epoch": 0.6732476809365544, + "grad_norm": 34.47300371880856, + "learning_rate": 3.6265097353640115e-07, + "logits/chosen": 3.4453125, + "logits/rejected": 3.5546875, + "logps/chosen": -818.0, + "logps/rejected": -774.0, + "loss": 0.4207, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.400390625, + "rewards/margins": 7.30859375, + "rewards/rejected": -4.9169921875, + "step": 3393 + }, + { + "epoch": 0.673446103477355, + "grad_norm": 29.542983412290248, + "learning_rate": 3.623676477574025e-07, + "logits/chosen": 4.15625, + "logits/rejected": 3.96875, + "logps/chosen": -1025.0, + "logps/rejected": -1641.0, + "loss": 0.3543, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.87890625, + "rewards/margins": 12.3203125, + "rewards/rejected": -9.453125, + "step": 3394 + }, + { + "epoch": 0.6736445260181556, + "grad_norm": 36.83494484733378, + "learning_rate": 3.6208441198230997e-07, + 
"logits/chosen": 4.11328125, + "logits/rejected": 3.9296875, + "logps/chosen": -1793.0, + "logps/rejected": -687.0, + "loss": 0.5667, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.82080078125, + "rewards/margins": 4.0390625, + "rewards/rejected": -4.8515625, + "step": 3395 + }, + { + "epoch": 0.6738429485589563, + "grad_norm": 31.39027902440811, + "learning_rate": 3.6180126634698716e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 3.72265625, + "logps/chosen": -1084.0, + "logps/rejected": -738.0, + "loss": 0.4228, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.16796875, + "rewards/margins": 6.5703125, + "rewards/rejected": -4.3984375, + "step": 3396 + }, + { + "epoch": 0.6740413710997569, + "grad_norm": 29.483315579386144, + "learning_rate": 3.615182109872539e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 3.94140625, + "logps/chosen": -1285.0, + "logps/rejected": -729.0, + "loss": 0.295, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.61328125, + "rewards/margins": 10.546875, + "rewards/rejected": -6.921875, + "step": 3397 + }, + { + "epoch": 0.6742397936405575, + "grad_norm": 31.811286413562268, + "learning_rate": 3.612352460388866e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.90625, + "logps/chosen": -1001.0, + "logps/rejected": -1683.0, + "loss": 0.3465, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.009765625, + "rewards/margins": 11.0390625, + "rewards/rejected": -8.03125, + "step": 3398 + }, + { + "epoch": 0.6744382161813582, + "grad_norm": 32.82552807723143, + "learning_rate": 3.609523716376189e-07, + "logits/chosen": 3.65234375, + "logits/rejected": 3.62890625, + "logps/chosen": -971.0, + "logps/rejected": -652.5, + "loss": 0.3376, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.671875, + "rewards/margins": 7.671875, + "rewards/rejected": -4.9921875, + "step": 3399 + }, + { + "epoch": 0.6746366387221588, + "grad_norm": 25.147764531158117, + "learning_rate": 3.606695879191403e-07, + 
"logits/chosen": 3.80078125, + "logits/rejected": 3.640625, + "logps/chosen": -1151.0, + "logps/rejected": -658.0, + "loss": 0.4722, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.99951171875, + "rewards/margins": 8.6171875, + "rewards/rejected": -5.61328125, + "step": 3400 + }, + { + "epoch": 0.6748350612629594, + "grad_norm": 37.714179174965906, + "learning_rate": 3.6038689501909723e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 4.15234375, + "logps/chosen": -772.0, + "logps/rejected": -1173.0, + "loss": 0.4505, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.896484375, + "rewards/margins": 10.3203125, + "rewards/rejected": -8.421875, + "step": 3401 + }, + { + "epoch": 0.6750334838037602, + "grad_norm": 33.08880422440216, + "learning_rate": 3.6010429307309255e-07, + "logits/chosen": 3.7109375, + "logits/rejected": 3.578125, + "logps/chosen": -1040.5, + "logps/rejected": -677.5, + "loss": 0.4332, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.40625, + "rewards/margins": 7.03125, + "rewards/rejected": -4.61328125, + "step": 3402 + }, + { + "epoch": 0.6752319063445608, + "grad_norm": 38.22435861072971, + "learning_rate": 3.5982178221668533e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 3.5078125, + "logps/chosen": -1146.0, + "logps/rejected": -863.0, + "loss": 0.3524, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.283203125, + "rewards/margins": 9.6171875, + "rewards/rejected": -6.328125, + "step": 3403 + }, + { + "epoch": 0.6754303288853614, + "grad_norm": 34.96873471279592, + "learning_rate": 3.5953936258539095e-07, + "logits/chosen": 4.59375, + "logits/rejected": 4.203125, + "logps/chosen": -1233.0, + "logps/rejected": -776.5, + "loss": 0.3885, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.462890625, + "rewards/margins": 7.8984375, + "rewards/rejected": -5.4140625, + "step": 3404 + }, + { + "epoch": 0.675628751426162, + "grad_norm": 32.252784164118154, + "learning_rate": 3.5925703431468104e-07, + 
"logits/chosen": 3.734375, + "logits/rejected": 3.84765625, + "logps/chosen": -931.0, + "logps/rejected": -1318.5, + "loss": 0.4569, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.8212890625, + "rewards/margins": 9.203125, + "rewards/rejected": -7.375, + "step": 3405 + }, + { + "epoch": 0.6758271739669627, + "grad_norm": 41.958664610421536, + "learning_rate": 3.5897479753998385e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.8828125, + "logps/chosen": -1240.0, + "logps/rejected": -1001.0, + "loss": 0.3262, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.87109375, + "rewards/margins": 10.2890625, + "rewards/rejected": -7.4140625, + "step": 3406 + }, + { + "epoch": 0.6760255965077633, + "grad_norm": 28.552869237151146, + "learning_rate": 3.5869265239668255e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.15234375, + "logps/chosen": -1122.0, + "logps/rejected": -773.0, + "loss": 0.4378, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.341796875, + "rewards/margins": 6.890625, + "rewards/rejected": -4.5234375, + "step": 3407 + }, + { + "epoch": 0.6762240190485639, + "grad_norm": 29.12830438846341, + "learning_rate": 3.5841059902011803e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.0390625, + "logps/chosen": -1004.0, + "logps/rejected": -692.0, + "loss": 0.307, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.58984375, + "rewards/margins": 9.0546875, + "rewards/rejected": -6.4609375, + "step": 3408 + }, + { + "epoch": 0.6764224415893646, + "grad_norm": 25.957149552894517, + "learning_rate": 3.5812863754558586e-07, + "logits/chosen": 4.140625, + "logits/rejected": 3.9453125, + "logps/chosen": -1082.0, + "logps/rejected": -556.0, + "loss": 0.2278, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.322265625, + "rewards/margins": 10.03125, + "rewards/rejected": -6.703125, + "step": 3409 + }, + { + "epoch": 0.6766208641301652, + "grad_norm": 27.05624459506127, + "learning_rate": 3.578467681083379e-07, + 
"logits/chosen": 4.16015625, + "logits/rejected": 4.26171875, + "logps/chosen": -1010.0, + "logps/rejected": -715.0, + "loss": 0.3911, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.068359375, + "rewards/margins": 8.65625, + "rewards/rejected": -5.58984375, + "step": 3410 + }, + { + "epoch": 0.6768192866709658, + "grad_norm": 35.29527819480033, + "learning_rate": 3.5756499084358206e-07, + "logits/chosen": 3.75390625, + "logits/rejected": 3.875, + "logps/chosen": -1097.0, + "logps/rejected": -786.0, + "loss": 0.3419, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8125, + "rewards/margins": 9.296875, + "rewards/rejected": -6.4765625, + "step": 3411 + }, + { + "epoch": 0.6770177092117664, + "grad_norm": 28.224096797172614, + "learning_rate": 3.572833058864817e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.0234375, + "logps/chosen": -1020.0, + "logps/rejected": -981.0, + "loss": 0.2881, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4453125, + "rewards/margins": 14.40625, + "rewards/rejected": -10.953125, + "step": 3412 + }, + { + "epoch": 0.6772161317525671, + "grad_norm": 34.688380562999775, + "learning_rate": 3.5700171337215647e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.203125, + "logps/chosen": -841.0, + "logps/rejected": -731.0, + "loss": 0.5006, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.197265625, + "rewards/margins": 7.5859375, + "rewards/rejected": -5.390625, + "step": 3413 + }, + { + "epoch": 0.6774145542933677, + "grad_norm": 35.82581565761037, + "learning_rate": 3.5672021343568094e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.3359375, + "logps/chosen": -935.0, + "logps/rejected": -1341.0, + "loss": 0.5041, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.875, + "rewards/margins": 7.875, + "rewards/rejected": -5.98828125, + "step": 3414 + }, + { + "epoch": 0.6776129768341683, + "grad_norm": 24.76464512061306, + "learning_rate": 3.5643880621208566e-07, + "logits/chosen": 4.05078125, + 
"logits/rejected": 4.015625, + "logps/chosen": -1350.0, + "logps/rejected": -853.0, + "loss": 0.1925, + "rewards/accuracies": 0.96875, + "rewards/chosen": 4.359375, + "rewards/margins": 10.984375, + "rewards/rejected": -6.6484375, + "step": 3415 + }, + { + "epoch": 0.677811399374969, + "grad_norm": 28.136966563625478, + "learning_rate": 3.5615749183635693e-07, + "logits/chosen": 3.828125, + "logits/rejected": 3.9609375, + "logps/chosen": -739.0, + "logps/rejected": -681.0, + "loss": 0.2614, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.19140625, + "rewards/margins": 9.3671875, + "rewards/rejected": -6.1640625, + "step": 3416 + }, + { + "epoch": 0.6780098219157696, + "grad_norm": 29.43096611682, + "learning_rate": 3.5587627044343604e-07, + "logits/chosen": 4.6328125, + "logits/rejected": 4.2734375, + "logps/chosen": -1101.0, + "logps/rejected": -1101.0, + "loss": 0.301, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9765625, + "rewards/margins": 9.84375, + "rewards/rejected": -6.8671875, + "step": 3417 + }, + { + "epoch": 0.6782082444565702, + "grad_norm": 29.52494126758075, + "learning_rate": 3.5559514216821983e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 4.171875, + "logps/chosen": -1064.0, + "logps/rejected": -745.5, + "loss": 0.3585, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.625, + "rewards/margins": 8.8828125, + "rewards/rejected": -5.2734375, + "step": 3418 + }, + { + "epoch": 0.678406666997371, + "grad_norm": 32.72219295802627, + "learning_rate": 3.5531410714556086e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.03515625, + "logps/chosen": -800.0, + "logps/rejected": -635.5, + "loss": 0.5659, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.828125, + "rewards/margins": 5.375, + "rewards/rejected": -3.546875, + "step": 3419 + }, + { + "epoch": 0.6786050895381716, + "grad_norm": 26.129467411677084, + "learning_rate": 3.5503316551026616e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 3.74609375, + 
"logps/chosen": -1054.0, + "logps/rejected": -816.0, + "loss": 0.3958, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.58984375, + "rewards/margins": 7.765625, + "rewards/rejected": -5.19140625, + "step": 3420 + }, + { + "epoch": 0.6788035120789722, + "grad_norm": 23.185566130322794, + "learning_rate": 3.547523173970989e-07, + "logits/chosen": 4.046875, + "logits/rejected": 3.9921875, + "logps/chosen": -1212.0, + "logps/rejected": -1021.5, + "loss": 0.3296, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.93359375, + "rewards/margins": 9.2734375, + "rewards/rejected": -6.337890625, + "step": 3421 + }, + { + "epoch": 0.6790019346197728, + "grad_norm": 25.34709782474505, + "learning_rate": 3.5447156294077676e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.23046875, + "logps/chosen": -1354.0, + "logps/rejected": -938.0, + "loss": 0.2466, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3671875, + "rewards/margins": 10.859375, + "rewards/rejected": -7.4765625, + "step": 3422 + }, + { + "epoch": 0.6792003571605735, + "grad_norm": 44.91352768886536, + "learning_rate": 3.5419090227597235e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.2109375, + "logps/chosen": -905.0, + "logps/rejected": -733.0, + "loss": 0.5599, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.275390625, + "rewards/margins": 6.3125, + "rewards/rejected": -5.046875, + "step": 3423 + }, + { + "epoch": 0.6793987797013741, + "grad_norm": 21.736231795422757, + "learning_rate": 3.539103355373144e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.31640625, + "logps/chosen": -865.0, + "logps/rejected": -759.0, + "loss": 0.4262, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.32421875, + "rewards/margins": 8.140625, + "rewards/rejected": -5.8125, + "step": 3424 + }, + { + "epoch": 0.6795972022421747, + "grad_norm": 31.331230092893904, + "learning_rate": 3.536298628593849e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.34765625, + "logps/chosen": 
-1175.0, + "logps/rejected": -894.0, + "loss": 0.4123, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0, + "rewards/margins": 9.1640625, + "rewards/rejected": -6.1640625, + "step": 3425 + }, + { + "epoch": 0.6797956247829754, + "grad_norm": 30.02163528718561, + "learning_rate": 3.533494843767222e-07, + "logits/chosen": 3.51953125, + "logits/rejected": 3.21875, + "logps/chosen": -898.0, + "logps/rejected": -652.0, + "loss": 0.3533, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.83203125, + "rewards/margins": 9.0546875, + "rewards/rejected": -7.2109375, + "step": 3426 + }, + { + "epoch": 0.679994047323776, + "grad_norm": 28.760711228183336, + "learning_rate": 3.530692002238187e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.88671875, + "logps/chosen": -989.0, + "logps/rejected": -752.0, + "loss": 0.4868, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.74609375, + "rewards/margins": 8.15625, + "rewards/rejected": -5.42578125, + "step": 3427 + }, + { + "epoch": 0.6801924698645766, + "grad_norm": 30.756357827146562, + "learning_rate": 3.5278901053512146e-07, + "logits/chosen": 3.953125, + "logits/rejected": 3.9609375, + "logps/chosen": -828.0, + "logps/rejected": -2377.5, + "loss": 0.5007, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5546875, + "rewards/margins": 11.328125, + "rewards/rejected": -8.77734375, + "step": 3428 + }, + { + "epoch": 0.6803908924053772, + "grad_norm": 29.825398212658282, + "learning_rate": 3.5250891544503313e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.34375, + "logps/chosen": -1211.0, + "logps/rejected": -663.0, + "loss": 0.4556, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.0982666015625, + "rewards/margins": 8.0703125, + "rewards/rejected": -5.984375, + "step": 3429 + }, + { + "epoch": 0.6805893149461779, + "grad_norm": 34.461363084403686, + "learning_rate": 3.5222891508790974e-07, + "logits/chosen": 3.6015625, + "logits/rejected": 3.9453125, + "logps/chosen": -1135.0, + 
"logps/rejected": -1783.0, + "loss": 0.4108, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.259765625, + "rewards/margins": 12.1171875, + "rewards/rejected": -9.82421875, + "step": 3430 + }, + { + "epoch": 0.6807877374869785, + "grad_norm": 28.057002062596247, + "learning_rate": 3.5194900959806294e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.26953125, + "logps/chosen": -1528.0, + "logps/rejected": -863.0, + "loss": 0.2862, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.34765625, + "rewards/margins": 10.09375, + "rewards/rejected": -6.7578125, + "step": 3431 + }, + { + "epoch": 0.6809861600277791, + "grad_norm": 28.652457334090144, + "learning_rate": 3.5166919910975787e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.84375, + "logps/chosen": -1079.0, + "logps/rejected": -740.0, + "loss": 0.4532, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.587890625, + "rewards/margins": 7.54296875, + "rewards/rejected": -4.955078125, + "step": 3432 + }, + { + "epoch": 0.6811845825685798, + "grad_norm": 34.835728894704104, + "learning_rate": 3.513894837572152e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 3.9296875, + "logps/chosen": -807.0, + "logps/rejected": -569.0, + "loss": 0.4791, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.69921875, + "rewards/margins": 5.9921875, + "rewards/rejected": -4.2890625, + "step": 3433 + }, + { + "epoch": 0.6813830051093804, + "grad_norm": 36.36967551438041, + "learning_rate": 3.511098636746093e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.20703125, + "logps/chosen": -1056.0, + "logps/rejected": -852.0, + "loss": 0.5114, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.947265625, + "rewards/margins": 8.5703125, + "rewards/rejected": -5.625, + "step": 3434 + }, + { + "epoch": 0.681581427650181, + "grad_norm": 28.050010438769522, + "learning_rate": 3.5083033899606864e-07, + "logits/chosen": 4.6171875, + "logits/rejected": 4.28125, + "logps/chosen": -1256.0, + 
"logps/rejected": -874.0, + "loss": 0.3668, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.12890625, + "rewards/margins": 9.40625, + "rewards/rejected": -6.28125, + "step": 3435 + }, + { + "epoch": 0.6817798501909818, + "grad_norm": 25.94492570655743, + "learning_rate": 3.505509098556766e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 4.10546875, + "logps/chosen": -1057.0, + "logps/rejected": -860.0, + "loss": 0.5049, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.107421875, + "rewards/margins": 7.3359375, + "rewards/rejected": -6.23046875, + "step": 3436 + }, + { + "epoch": 0.6819782727317824, + "grad_norm": 34.01845606608354, + "learning_rate": 3.5027157638746995e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.12109375, + "logps/chosen": -827.0, + "logps/rejected": -666.0, + "loss": 0.4717, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.848876953125, + "rewards/margins": 6.7578125, + "rewards/rejected": -4.91015625, + "step": 3437 + }, + { + "epoch": 0.682176695272583, + "grad_norm": 32.029441687235035, + "learning_rate": 3.4999233872544056e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.76953125, + "logps/chosen": -986.5, + "logps/rejected": -610.0, + "loss": 0.3707, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.8603515625, + "rewards/margins": 8.0859375, + "rewards/rejected": -6.21875, + "step": 3438 + }, + { + "epoch": 0.6823751178133836, + "grad_norm": 32.105695004831816, + "learning_rate": 3.4971319700353343e-07, + "logits/chosen": 3.671875, + "logits/rejected": 3.44921875, + "logps/chosen": -1168.0, + "logps/rejected": -847.0, + "loss": 0.3176, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.408203125, + "rewards/margins": 13.6015625, + "rewards/rejected": -10.1953125, + "step": 3439 + }, + { + "epoch": 0.6825735403541843, + "grad_norm": 27.142974678936035, + "learning_rate": 3.4943415135564795e-07, + "logits/chosen": 3.875, + "logits/rejected": 4.1015625, + "logps/chosen": -1007.5, + 
"logps/rejected": -675.5, + "loss": 0.3633, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.1552734375, + "rewards/margins": 8.5, + "rewards/rejected": -6.3359375, + "step": 3440 + }, + { + "epoch": 0.6827719628949849, + "grad_norm": 30.512019344174465, + "learning_rate": 3.491552019156374e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.30078125, + "logps/chosen": -956.0, + "logps/rejected": -865.0, + "loss": 0.4466, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.0615234375, + "rewards/margins": 8.7890625, + "rewards/rejected": -6.73046875, + "step": 3441 + }, + { + "epoch": 0.6829703854357855, + "grad_norm": 38.71499258106303, + "learning_rate": 3.488763488173088e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.359375, + "logps/chosen": -1088.0, + "logps/rejected": -1667.0, + "loss": 0.339, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.33203125, + "rewards/margins": 10.359375, + "rewards/rejected": -8.02734375, + "step": 3442 + }, + { + "epoch": 0.6831688079765862, + "grad_norm": 31.557989343584953, + "learning_rate": 3.4859759219442285e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.91015625, + "logps/chosen": -1030.0, + "logps/rejected": -901.5, + "loss": 0.3944, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.248046875, + "rewards/margins": 10.671875, + "rewards/rejected": -8.421875, + "step": 3443 + }, + { + "epoch": 0.6833672305173868, + "grad_norm": 36.103062281389626, + "learning_rate": 3.4831893218069466e-07, + "logits/chosen": 3.9375, + "logits/rejected": 4.04296875, + "logps/chosen": -1013.0, + "logps/rejected": -687.0, + "loss": 0.3734, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.845703125, + "rewards/margins": 8.546875, + "rewards/rejected": -5.69140625, + "step": 3444 + }, + { + "epoch": 0.6835656530581874, + "grad_norm": 29.720293275672137, + "learning_rate": 3.4804036890979205e-07, + "logits/chosen": 4.7421875, + "logits/rejected": 4.640625, + "logps/chosen": -1148.0, + 
"logps/rejected": -861.0, + "loss": 0.3678, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.841796875, + "rewards/margins": 10.28125, + "rewards/rejected": -7.453125, + "step": 3445 + }, + { + "epoch": 0.683764075598988, + "grad_norm": 31.67777335281669, + "learning_rate": 3.4776190251533707e-07, + "logits/chosen": 3.65625, + "logits/rejected": 3.75, + "logps/chosen": -1039.0, + "logps/rejected": -889.0, + "loss": 0.3558, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.54296875, + "rewards/margins": 9.7578125, + "rewards/rejected": -7.2265625, + "step": 3446 + }, + { + "epoch": 0.6839624981397887, + "grad_norm": 26.208298279939328, + "learning_rate": 3.4748353313090496e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.27734375, + "logps/chosen": -1108.0, + "logps/rejected": -862.5, + "loss": 0.3233, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1015625, + "rewards/margins": 9.0859375, + "rewards/rejected": -5.9921875, + "step": 3447 + }, + { + "epoch": 0.6841609206805893, + "grad_norm": 28.442160473499893, + "learning_rate": 3.4720526089002454e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.32421875, + "logps/chosen": -987.0, + "logps/rejected": -675.0, + "loss": 0.6222, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.9765625, + "rewards/margins": 5.40625, + "rewards/rejected": -3.4296875, + "step": 3448 + }, + { + "epoch": 0.6843593432213899, + "grad_norm": 30.383971758880627, + "learning_rate": 3.4692708592617855e-07, + "logits/chosen": 4.15625, + "logits/rejected": 3.96875, + "logps/chosen": -921.0, + "logps/rejected": -619.5, + "loss": 0.4516, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.09375, + "rewards/margins": 5.90625, + "rewards/rejected": -3.8125, + "step": 3449 + }, + { + "epoch": 0.6845577657621906, + "grad_norm": 27.315768911029075, + "learning_rate": 3.4664900837280186e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.64453125, + "logps/chosen": -620.0, + "logps/rejected": -396.5, + 
"loss": 0.3712, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.56640625, + "rewards/margins": 5.9140625, + "rewards/rejected": -3.3515625, + "step": 3450 + }, + { + "epoch": 0.6847561883029912, + "grad_norm": 26.84394128805669, + "learning_rate": 3.4637102836328393e-07, + "logits/chosen": 3.47265625, + "logits/rejected": 3.37890625, + "logps/chosen": -888.0, + "logps/rejected": -646.0, + "loss": 0.4174, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9462890625, + "rewards/margins": 6.4609375, + "rewards/rejected": -4.509765625, + "step": 3451 + }, + { + "epoch": 0.6849546108437918, + "grad_norm": 27.2932110085492, + "learning_rate": 3.460931460309663e-07, + "logits/chosen": 4.34765625, + "logits/rejected": 4.16796875, + "logps/chosen": -940.5, + "logps/rejected": -691.5, + "loss": 0.418, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.439453125, + "rewards/margins": 7.890625, + "rewards/rejected": -5.4609375, + "step": 3452 + }, + { + "epoch": 0.6851530333845924, + "grad_norm": 40.354547352770716, + "learning_rate": 3.458153615091447e-07, + "logits/chosen": 3.734375, + "logits/rejected": 3.83984375, + "logps/chosen": -958.0, + "logps/rejected": -1024.0, + "loss": 0.428, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.982421875, + "rewards/margins": 7.859375, + "rewards/rejected": -5.87109375, + "step": 3453 + }, + { + "epoch": 0.6853514559253931, + "grad_norm": 31.757922151318333, + "learning_rate": 3.4553767493106713e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.9296875, + "logps/chosen": -860.5, + "logps/rejected": -700.0, + "loss": 0.4758, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.67578125, + "rewards/margins": 5.56640625, + "rewards/rejected": -2.8935546875, + "step": 3454 + }, + { + "epoch": 0.6855498784661938, + "grad_norm": 39.647269811284325, + "learning_rate": 3.4526008642993476e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.21484375, + "logps/chosen": -1257.0, + "logps/rejected": -937.0, + 
"loss": 0.5913, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.6171875, + "rewards/margins": 6.46484375, + "rewards/rejected": -3.83984375, + "step": 3455 + }, + { + "epoch": 0.6857483010069944, + "grad_norm": 26.54577317875191, + "learning_rate": 3.4498259613890245e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.94921875, + "logps/chosen": -785.0, + "logps/rejected": -663.5, + "loss": 0.2543, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.77734375, + "rewards/margins": 8.96875, + "rewards/rejected": -6.1875, + "step": 3456 + }, + { + "epoch": 0.6859467235477951, + "grad_norm": 33.00574161497426, + "learning_rate": 3.4470520419107664e-07, + "logits/chosen": 3.7109375, + "logits/rejected": 3.6484375, + "logps/chosen": -1181.0, + "logps/rejected": -809.5, + "loss": 0.4576, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.83203125, + "rewards/margins": 6.806640625, + "rewards/rejected": -5.96484375, + "step": 3457 + }, + { + "epoch": 0.6861451460885957, + "grad_norm": 31.102645374988064, + "learning_rate": 3.4442791071951786e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.30859375, + "logps/chosen": -1279.0, + "logps/rejected": -917.0, + "loss": 0.3174, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.169921875, + "rewards/margins": 9.4453125, + "rewards/rejected": -6.2734375, + "step": 3458 + }, + { + "epoch": 0.6863435686293963, + "grad_norm": 31.80693951405002, + "learning_rate": 3.4415071585723866e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 3.66796875, + "logps/chosen": -740.0, + "logps/rejected": -694.0, + "loss": 0.4978, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.5634765625, + "rewards/margins": 6.046875, + "rewards/rejected": -4.47265625, + "step": 3459 + }, + { + "epoch": 0.686541991170197, + "grad_norm": 35.32513029902036, + "learning_rate": 3.438736197372044e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.87890625, + "logps/chosen": -1148.0, + "logps/rejected": -851.0, + "loss": 
0.4949, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.818359375, + "rewards/margins": 8.65625, + "rewards/rejected": -5.84375, + "step": 3460 + }, + { + "epoch": 0.6867404137109976, + "grad_norm": 36.19321148477516, + "learning_rate": 3.4359662249233357e-07, + "logits/chosen": 4.3515625, + "logits/rejected": 4.375, + "logps/chosen": -1057.5, + "logps/rejected": -712.5, + "loss": 0.4692, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.66162109375, + "rewards/margins": 6.15234375, + "rewards/rejected": -3.48046875, + "step": 3461 + }, + { + "epoch": 0.6869388362517982, + "grad_norm": 23.20117223494198, + "learning_rate": 3.4331972425549614e-07, + "logits/chosen": 4.6328125, + "logits/rejected": 4.49609375, + "logps/chosen": -1091.0, + "logps/rejected": -645.0, + "loss": 0.4341, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.912109375, + "rewards/margins": 8.171875, + "rewards/rejected": -4.26171875, + "step": 3462 + }, + { + "epoch": 0.6871372587925988, + "grad_norm": 26.60807322539934, + "learning_rate": 3.430429251595159e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.234375, + "logps/chosen": -771.0, + "logps/rejected": -855.0, + "loss": 0.4709, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3671875, + "rewards/margins": 7.1015625, + "rewards/rejected": -4.73828125, + "step": 3463 + }, + { + "epoch": 0.6873356813333995, + "grad_norm": 33.75787899068763, + "learning_rate": 3.427662253371682e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 4.06640625, + "logps/chosen": -858.5, + "logps/rejected": -824.0, + "loss": 0.4603, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9033203125, + "rewards/margins": 7.546875, + "rewards/rejected": -5.64453125, + "step": 3464 + }, + { + "epoch": 0.6875341038742001, + "grad_norm": 25.75869504624708, + "learning_rate": 3.4248962492118105e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.8046875, + "logps/chosen": -895.0, + "logps/rejected": -846.5, + "loss": 0.421, + 
"rewards/accuracies": 0.84375, + "rewards/chosen": 1.646484375, + "rewards/margins": 8.6640625, + "rewards/rejected": -7.01953125, + "step": 3465 + }, + { + "epoch": 0.6877325264150007, + "grad_norm": 26.74143216775426, + "learning_rate": 3.4221312404423486e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.27734375, + "logps/chosen": -708.0, + "logps/rejected": -600.0, + "loss": 0.5725, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.9208984375, + "rewards/margins": 6.068359375, + "rewards/rejected": -4.150634765625, + "step": 3466 + }, + { + "epoch": 0.6879309489558014, + "grad_norm": 30.368478709903723, + "learning_rate": 3.419367228389619e-07, + "logits/chosen": 3.6484375, + "logits/rejected": 3.37890625, + "logps/chosen": -875.0, + "logps/rejected": -744.5, + "loss": 0.5051, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.580078125, + "rewards/margins": 6.50390625, + "rewards/rejected": -4.923828125, + "step": 3467 + }, + { + "epoch": 0.688129371496602, + "grad_norm": 22.84105895838829, + "learning_rate": 3.4166042143794705e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.609375, + "logps/chosen": -1268.0, + "logps/rejected": -1666.0, + "loss": 0.3281, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.33984375, + "rewards/margins": 12.203125, + "rewards/rejected": -8.890625, + "step": 3468 + }, + { + "epoch": 0.6883277940374026, + "grad_norm": 32.161421735384195, + "learning_rate": 3.413842199737275e-07, + "logits/chosen": 3.9375, + "logits/rejected": 3.86328125, + "logps/chosen": -1178.0, + "logps/rejected": -1013.5, + "loss": 0.4241, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.408203125, + "rewards/margins": 8.171875, + "rewards/rejected": -5.75, + "step": 3469 + }, + { + "epoch": 0.6885262165782032, + "grad_norm": 25.994315083724736, + "learning_rate": 3.411081185787918e-07, + "logits/chosen": 4.37109375, + "logits/rejected": 4.4140625, + "logps/chosen": -854.0, + "logps/rejected": -777.0, + "loss": 0.3377, + 
"rewards/accuracies": 0.96875, + "rewards/chosen": 3.087890625, + "rewards/margins": 9.4609375, + "rewards/rejected": -6.3828125, + "step": 3470 + }, + { + "epoch": 0.688724639119004, + "grad_norm": 32.41821973367615, + "learning_rate": 3.4083211738558106e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.52734375, + "logps/chosen": -992.0, + "logps/rejected": -692.0, + "loss": 0.4265, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.09375, + "rewards/margins": 7.26953125, + "rewards/rejected": -5.18359375, + "step": 3471 + }, + { + "epoch": 0.6889230616598045, + "grad_norm": 32.634715609451135, + "learning_rate": 3.40556216526488e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.1640625, + "logps/chosen": -1045.0, + "logps/rejected": -846.0, + "loss": 0.3726, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.736328125, + "rewards/margins": 9.203125, + "rewards/rejected": -6.4609375, + "step": 3472 + }, + { + "epoch": 0.6891214842006051, + "grad_norm": 41.26332307612562, + "learning_rate": 3.402804161338577e-07, + "logits/chosen": 4.078125, + "logits/rejected": 4.08203125, + "logps/chosen": -1239.0, + "logps/rejected": -791.0, + "loss": 0.38, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.416015625, + "rewards/margins": 7.63671875, + "rewards/rejected": -5.220703125, + "step": 3473 + }, + { + "epoch": 0.6893199067414059, + "grad_norm": 36.231562473580965, + "learning_rate": 3.400047163399863e-07, + "logits/chosen": 4.51171875, + "logits/rejected": 4.3359375, + "logps/chosen": -1037.0, + "logps/rejected": -1485.0, + "loss": 0.4195, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9736328125, + "rewards/margins": 9.6015625, + "rewards/rejected": -7.6171875, + "step": 3474 + }, + { + "epoch": 0.6895183292822065, + "grad_norm": 26.575289862524247, + "learning_rate": 3.3972911727712203e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.171875, + "logps/chosen": -1035.0, + "logps/rejected": -652.5, + "loss": 0.4045, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 2.1953125, + "rewards/margins": 8.125, + "rewards/rejected": -5.9453125, + "step": 3475 + }, + { + "epoch": 0.6897167518230071, + "grad_norm": 28.646130694314486, + "learning_rate": 3.3945361907746544e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.875, + "logps/chosen": -823.0, + "logps/rejected": -657.0, + "loss": 0.3764, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.81005859375, + "rewards/margins": 6.421875, + "rewards/rejected": -4.61328125, + "step": 3476 + }, + { + "epoch": 0.6899151743638078, + "grad_norm": 32.48428104168039, + "learning_rate": 3.391782218731677e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.1796875, + "logps/chosen": -973.0, + "logps/rejected": -840.0, + "loss": 0.3537, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.177734375, + "rewards/margins": 10.046875, + "rewards/rejected": -7.8671875, + "step": 3477 + }, + { + "epoch": 0.6901135969046084, + "grad_norm": 35.021988838202056, + "learning_rate": 3.3890292579633206e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 3.828125, + "logps/chosen": -992.0, + "logps/rejected": -889.5, + "loss": 0.4215, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.041015625, + "rewards/margins": 8.3310546875, + "rewards/rejected": -7.30078125, + "step": 3478 + }, + { + "epoch": 0.690312019445409, + "grad_norm": 27.20464154030996, + "learning_rate": 3.3862773097901315e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.33203125, + "logps/chosen": -926.0, + "logps/rejected": -749.0, + "loss": 0.3345, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9921875, + "rewards/margins": 9.4765625, + "rewards/rejected": -6.484375, + "step": 3479 + }, + { + "epoch": 0.6905104419862096, + "grad_norm": 33.57567182709801, + "learning_rate": 3.3835263755321685e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.1875, + "logps/chosen": -1251.0, + "logps/rejected": -671.0, + "loss": 0.3264, + "rewards/accuracies": 
0.90625, + "rewards/chosen": 2.8828125, + "rewards/margins": 9.734375, + "rewards/rejected": -6.859375, + "step": 3480 + }, + { + "epoch": 0.6907088645270103, + "grad_norm": 34.49701669180244, + "learning_rate": 3.380776456509011e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.98046875, + "logps/chosen": -984.0, + "logps/rejected": -753.5, + "loss": 0.4169, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.85888671875, + "rewards/margins": 5.921875, + "rewards/rejected": -4.05859375, + "step": 3481 + }, + { + "epoch": 0.6909072870678109, + "grad_norm": 29.32503526397508, + "learning_rate": 3.378027554039739e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.08203125, + "logps/chosen": -1050.0, + "logps/rejected": -996.0, + "loss": 0.3272, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.478515625, + "rewards/margins": 9.4296875, + "rewards/rejected": -6.94140625, + "step": 3482 + }, + { + "epoch": 0.6911057096086115, + "grad_norm": 30.664312037586036, + "learning_rate": 3.3752796694429585e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.08203125, + "logps/chosen": -718.0, + "logps/rejected": -637.5, + "loss": 0.4958, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5302734375, + "rewards/margins": 6.4140625, + "rewards/rejected": -4.87890625, + "step": 3483 + }, + { + "epoch": 0.6913041321494122, + "grad_norm": 39.57551983798334, + "learning_rate": 3.372532804036779e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.9375, + "logps/chosen": -929.0, + "logps/rejected": -1410.0, + "loss": 0.3654, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.533203125, + "rewards/margins": 9.1328125, + "rewards/rejected": -6.59375, + "step": 3484 + }, + { + "epoch": 0.6915025546902128, + "grad_norm": 27.686559413051608, + "learning_rate": 3.369786959138821e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.1171875, + "logps/chosen": -1068.0, + "logps/rejected": -976.0, + "loss": 0.36, + "rewards/accuracies": 0.84375, 
+ "rewards/chosen": 3.28515625, + "rewards/margins": 8.53125, + "rewards/rejected": -5.25390625, + "step": 3485 + }, + { + "epoch": 0.6917009772310134, + "grad_norm": 29.196621393277574, + "learning_rate": 3.3670421360662215e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 3.94140625, + "logps/chosen": -1077.0, + "logps/rejected": -800.5, + "loss": 0.4373, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.78125, + "rewards/margins": 16.1328125, + "rewards/rejected": -13.40625, + "step": 3486 + }, + { + "epoch": 0.691899399771814, + "grad_norm": 17.922400482132083, + "learning_rate": 3.364298336135618e-07, + "logits/chosen": 4.6015625, + "logits/rejected": 4.6953125, + "logps/chosen": -992.0, + "logps/rejected": -789.0, + "loss": 0.3452, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.70703125, + "rewards/margins": 9.2734375, + "rewards/rejected": -6.578125, + "step": 3487 + }, + { + "epoch": 0.6920978223126147, + "grad_norm": 36.235287624029056, + "learning_rate": 3.3615555606631674e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.90625, + "logps/chosen": -1058.0, + "logps/rejected": -749.0, + "loss": 0.4107, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.60546875, + "rewards/margins": 8.0, + "rewards/rejected": -5.3828125, + "step": 3488 + }, + { + "epoch": 0.6922962448534153, + "grad_norm": 37.89184149149818, + "learning_rate": 3.358813810964528e-07, + "logits/chosen": 4.35546875, + "logits/rejected": 4.26953125, + "logps/chosen": -926.0, + "logps/rejected": -670.0, + "loss": 0.4524, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.326171875, + "rewards/margins": 17.2890625, + "rewards/rejected": -14.98046875, + "step": 3489 + }, + { + "epoch": 0.692494667394216, + "grad_norm": 34.866622639972746, + "learning_rate": 3.3560730883548694e-07, + "logits/chosen": 3.7734375, + "logits/rejected": 3.609375, + "logps/chosen": -815.0, + "logps/rejected": -753.0, + "loss": 0.268, + "rewards/accuracies": 1.0, + "rewards/chosen": 
2.40234375, + "rewards/margins": 8.7109375, + "rewards/rejected": -6.3125, + "step": 3490 + }, + { + "epoch": 0.6926930899350167, + "grad_norm": 34.598264028615674, + "learning_rate": 3.353333394148867e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.3046875, + "logps/chosen": -790.0, + "logps/rejected": -788.0, + "loss": 0.5045, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.806640625, + "rewards/margins": 6.96875, + "rewards/rejected": -5.17578125, + "step": 3491 + }, + { + "epoch": 0.6928915124758173, + "grad_norm": 39.12017461184794, + "learning_rate": 3.350594729660703e-07, + "logits/chosen": 4.515625, + "logits/rejected": 4.13671875, + "logps/chosen": -1178.0, + "logps/rejected": -744.0, + "loss": 0.3849, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.734375, + "rewards/margins": 7.90625, + "rewards/rejected": -5.1796875, + "step": 3492 + }, + { + "epoch": 0.6930899350166179, + "grad_norm": 32.76481425948304, + "learning_rate": 3.34785709620407e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 4.0859375, + "logps/chosen": -1138.0, + "logps/rejected": -781.0, + "loss": 0.4361, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.85546875, + "rewards/margins": 7.40625, + "rewards/rejected": -4.5625, + "step": 3493 + }, + { + "epoch": 0.6932883575574186, + "grad_norm": 37.459976912112154, + "learning_rate": 3.345120495092154e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 3.66796875, + "logps/chosen": -792.0, + "logps/rejected": -1248.0, + "loss": 0.3045, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.203125, + "rewards/margins": 11.0234375, + "rewards/rejected": -8.80859375, + "step": 3494 + }, + { + "epoch": 0.6934867800982192, + "grad_norm": 52.666764330061625, + "learning_rate": 3.3423849276376616e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 4.29296875, + "logps/chosen": -743.5, + "logps/rejected": -1095.0, + "loss": 0.4726, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8359375, + 
"rewards/margins": 9.6875, + "rewards/rejected": -7.859375, + "step": 3495 + }, + { + "epoch": 0.6936852026390198, + "grad_norm": 28.81124130226153, + "learning_rate": 3.3396503951527923e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.94921875, + "logps/chosen": -948.0, + "logps/rejected": -567.0, + "loss": 0.4533, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.561279296875, + "rewards/margins": 6.9453125, + "rewards/rejected": -4.380859375, + "step": 3496 + }, + { + "epoch": 0.6938836251798204, + "grad_norm": 32.480897944422914, + "learning_rate": 3.336916898949254e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.56640625, + "logps/chosen": -737.5, + "logps/rejected": -761.0, + "loss": 0.4138, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.33984375, + "rewards/margins": 8.515625, + "rewards/rejected": -6.16015625, + "step": 3497 + }, + { + "epoch": 0.6940820477206211, + "grad_norm": 32.77741915711343, + "learning_rate": 3.3341844403382557e-07, + "logits/chosen": 4.5390625, + "logits/rejected": 4.44140625, + "logps/chosen": -1139.0, + "logps/rejected": -837.0, + "loss": 0.4911, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.76171875, + "rewards/margins": 8.7421875, + "rewards/rejected": -5.984375, + "step": 3498 + }, + { + "epoch": 0.6942804702614217, + "grad_norm": 29.209484168764014, + "learning_rate": 3.3314530206305096e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.421875, + "logps/chosen": -636.0, + "logps/rejected": -1335.5, + "loss": 0.5834, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.74462890625, + "rewards/margins": 6.72265625, + "rewards/rejected": -5.97265625, + "step": 3499 + }, + { + "epoch": 0.6944788928022223, + "grad_norm": 32.333625503822425, + "learning_rate": 3.3287226411362274e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.80859375, + "logps/chosen": -1211.0, + "logps/rejected": -1252.0, + "loss": 0.3569, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.703125, + 
"rewards/margins": 10.46875, + "rewards/rejected": -7.7578125, + "step": 3500 + }, + { + "epoch": 0.694677315343023, + "grad_norm": 29.754299371540522, + "learning_rate": 3.3259933031651266e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.8671875, + "logps/chosen": -910.0, + "logps/rejected": -501.5, + "loss": 0.4664, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6474609375, + "rewards/margins": 6.5, + "rewards/rejected": -4.859375, + "step": 3501 + }, + { + "epoch": 0.6948757378838236, + "grad_norm": 28.262820518186402, + "learning_rate": 3.323265008026421e-07, + "logits/chosen": 3.8359375, + "logits/rejected": 4.046875, + "logps/chosen": -990.0, + "logps/rejected": -907.0, + "loss": 0.3342, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.779296875, + "rewards/margins": 11.0703125, + "rewards/rejected": -8.2890625, + "step": 3502 + }, + { + "epoch": 0.6950741604246242, + "grad_norm": 19.878468468306053, + "learning_rate": 3.3205377570288255e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.109375, + "logps/chosen": -718.0, + "logps/rejected": -535.5, + "loss": 0.4114, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4453125, + "rewards/margins": 7.171875, + "rewards/rejected": -4.71875, + "step": 3503 + }, + { + "epoch": 0.6952725829654248, + "grad_norm": 29.45701099890306, + "learning_rate": 3.3178115514805525e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 3.90625, + "logps/chosen": -1085.0, + "logps/rejected": -848.0, + "loss": 0.3574, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.517578125, + "rewards/margins": 7.921875, + "rewards/rejected": -5.390625, + "step": 3504 + }, + { + "epoch": 0.6954710055062255, + "grad_norm": 39.375555603884195, + "learning_rate": 3.3150863926893145e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 3.87109375, + "logps/chosen": -748.0, + "logps/rejected": -687.0, + "loss": 0.4437, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.73828125, + "rewards/margins": 
5.5546875, + "rewards/rejected": -3.8125, + "step": 3505 + }, + { + "epoch": 0.6956694280470261, + "grad_norm": 31.485605585669926, + "learning_rate": 3.312362281962326e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.18359375, + "logps/chosen": -1210.0, + "logps/rejected": -701.5, + "loss": 0.4347, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.27734375, + "rewards/margins": 6.9716796875, + "rewards/rejected": -5.6806640625, + "step": 3506 + }, + { + "epoch": 0.6958678505878267, + "grad_norm": 32.69628546391759, + "learning_rate": 3.3096392206062885e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.98828125, + "logps/chosen": -1328.0, + "logps/rejected": -1102.0, + "loss": 0.3085, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3984375, + "rewards/margins": 11.8984375, + "rewards/rejected": -8.4921875, + "step": 3507 + }, + { + "epoch": 0.6960662731286275, + "grad_norm": 27.217359596720378, + "learning_rate": 3.30691720992741e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.65625, + "logps/chosen": -1033.0, + "logps/rejected": -708.5, + "loss": 0.2813, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.85546875, + "rewards/margins": 9.4375, + "rewards/rejected": -6.578125, + "step": 3508 + }, + { + "epoch": 0.6962646956694281, + "grad_norm": 22.721512782590327, + "learning_rate": 3.3041962512313893e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 4.01171875, + "logps/chosen": -1038.0, + "logps/rejected": -1875.5, + "loss": 0.3793, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.80859375, + "rewards/margins": 11.21875, + "rewards/rejected": -8.41015625, + "step": 3509 + }, + { + "epoch": 0.6964631182102287, + "grad_norm": 28.914616608456562, + "learning_rate": 3.3014763458234206e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.046875, + "logps/chosen": -935.0, + "logps/rejected": -770.0, + "loss": 0.4048, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.419921875, + "rewards/margins": 
8.5234375, + "rewards/rejected": -5.1015625, + "step": 3510 + }, + { + "epoch": 0.6966615407510293, + "grad_norm": 32.002608707125745, + "learning_rate": 3.2987574950081995e-07, + "logits/chosen": 4.171875, + "logits/rejected": 3.96875, + "logps/chosen": -1202.0, + "logps/rejected": -1572.0, + "loss": 0.4316, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.91015625, + "rewards/margins": 10.765625, + "rewards/rejected": -7.8515625, + "step": 3511 + }, + { + "epoch": 0.69685996329183, + "grad_norm": 35.46761139327816, + "learning_rate": 3.2960397000899034e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 3.98046875, + "logps/chosen": -648.0, + "logps/rejected": -591.0, + "loss": 0.5015, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.388671875, + "rewards/margins": 5.49609375, + "rewards/rejected": -4.109375, + "step": 3512 + }, + { + "epoch": 0.6970583858326306, + "grad_norm": 37.50663849662602, + "learning_rate": 3.293322962372215e-07, + "logits/chosen": 4.15625, + "logits/rejected": 3.82421875, + "logps/chosen": -1186.0, + "logps/rejected": -837.0, + "loss": 0.5068, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.779296875, + "rewards/margins": 5.5078125, + "rewards/rejected": -3.732421875, + "step": 3513 + }, + { + "epoch": 0.6972568083734312, + "grad_norm": 32.97511580971527, + "learning_rate": 3.2906072831583037e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.78515625, + "logps/chosen": -1177.0, + "logps/rejected": -775.0, + "loss": 0.4257, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.939453125, + "rewards/margins": 6.4296875, + "rewards/rejected": -4.484375, + "step": 3514 + }, + { + "epoch": 0.6974552309142319, + "grad_norm": 26.16486084719554, + "learning_rate": 3.287892663750833e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.75390625, + "logps/chosen": -692.0, + "logps/rejected": -514.5, + "loss": 0.3835, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3232421875, + "rewards/margins": 6.4453125, + 
"rewards/rejected": -4.125, + "step": 3515 + }, + { + "epoch": 0.6976536534550325, + "grad_norm": 26.537555827628665, + "learning_rate": 3.2851791054519574e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.0859375, + "logps/chosen": -1446.0, + "logps/rejected": -1921.0, + "loss": 0.3718, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5234375, + "rewards/margins": 12.25, + "rewards/rejected": -9.7421875, + "step": 3516 + }, + { + "epoch": 0.6978520759958331, + "grad_norm": 30.15889659799017, + "learning_rate": 3.282466609563319e-07, + "logits/chosen": 3.640625, + "logits/rejected": 3.5546875, + "logps/chosen": -897.0, + "logps/rejected": -622.5, + "loss": 0.4965, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.462890625, + "rewards/margins": 6.291015625, + "rewards/rejected": -4.83984375, + "step": 3517 + }, + { + "epoch": 0.6980504985366338, + "grad_norm": 36.343905092546464, + "learning_rate": 3.279755177386063e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 3.85546875, + "logps/chosen": -891.0, + "logps/rejected": -688.0, + "loss": 0.3214, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.470703125, + "rewards/margins": 7.5390625, + "rewards/rejected": -5.0703125, + "step": 3518 + }, + { + "epoch": 0.6982489210774344, + "grad_norm": 34.02422142419457, + "learning_rate": 3.2770448102208046e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.01953125, + "logps/chosen": -1035.0, + "logps/rejected": -620.0, + "loss": 0.4898, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.03515625, + "rewards/margins": 6.5859375, + "rewards/rejected": -4.55078125, + "step": 3519 + }, + { + "epoch": 0.698447343618235, + "grad_norm": 32.37219885289826, + "learning_rate": 3.274335509367667e-07, + "logits/chosen": 3.921875, + "logits/rejected": 4.171875, + "logps/chosen": -726.0, + "logps/rejected": -699.0, + "loss": 0.3878, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.771484375, + "rewards/margins": 7.9375, + "rewards/rejected": 
-5.15625, + "step": 3520 + }, + { + "epoch": 0.6986457661590356, + "grad_norm": 30.722138108144563, + "learning_rate": 3.271627276126249e-07, + "logits/chosen": 4.29296875, + "logits/rejected": 3.99609375, + "logps/chosen": -1233.0, + "logps/rejected": -1012.0, + "loss": 0.3885, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.84375, + "rewards/margins": 9.4609375, + "rewards/rejected": -6.59375, + "step": 3521 + }, + { + "epoch": 0.6988441886998363, + "grad_norm": 34.099306078256824, + "learning_rate": 3.2689201117956445e-07, + "logits/chosen": 4.28515625, + "logits/rejected": 4.10546875, + "logps/chosen": -943.0, + "logps/rejected": -661.5, + "loss": 0.4158, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.11083984375, + "rewards/margins": 7.5, + "rewards/rejected": -5.390625, + "step": 3522 + }, + { + "epoch": 0.6990426112406369, + "grad_norm": 31.65639693401644, + "learning_rate": 3.2662140176744303e-07, + "logits/chosen": 3.90625, + "logits/rejected": 4.375, + "logps/chosen": -722.0, + "logps/rejected": -742.0, + "loss": 0.5826, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.9296875, + "rewards/margins": 5.2109375, + "rewards/rejected": -3.29296875, + "step": 3523 + }, + { + "epoch": 0.6992410337814375, + "grad_norm": 37.07120244324559, + "learning_rate": 3.263508995060672e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.87890625, + "logps/chosen": -1137.0, + "logps/rejected": -672.0, + "loss": 0.331, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.09375, + "rewards/margins": 7.8203125, + "rewards/rejected": -4.70703125, + "step": 3524 + }, + { + "epoch": 0.6994394563222383, + "grad_norm": 29.196656900175398, + "learning_rate": 3.260805045251922e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 3.98828125, + "logps/chosen": -869.0, + "logps/rejected": -613.5, + "loss": 0.3299, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.13671875, + "rewards/margins": 7.8671875, + "rewards/rejected": -4.7421875, + "step": 
3525 + }, + { + "epoch": 0.6996378788630389, + "grad_norm": 25.197598291640336, + "learning_rate": 3.2581021695452166e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.14453125, + "logps/chosen": -1226.0, + "logps/rejected": -1027.0, + "loss": 0.2182, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.84375, + "rewards/margins": 12.21875, + "rewards/rejected": -8.390625, + "step": 3526 + }, + { + "epoch": 0.6998363014038395, + "grad_norm": 30.15187203720657, + "learning_rate": 3.2554003692370765e-07, + "logits/chosen": 3.62890625, + "logits/rejected": 3.484375, + "logps/chosen": -1257.0, + "logps/rejected": -746.0, + "loss": 0.2852, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.34375, + "rewards/margins": 9.6875, + "rewards/rejected": -6.328125, + "step": 3527 + }, + { + "epoch": 0.7000347239446401, + "grad_norm": 27.256964762010917, + "learning_rate": 3.2526996456235087e-07, + "logits/chosen": 3.828125, + "logits/rejected": 3.78515625, + "logps/chosen": -801.0, + "logps/rejected": -644.0, + "loss": 0.3528, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.51171875, + "rewards/margins": 7.8984375, + "rewards/rejected": -5.37109375, + "step": 3528 + }, + { + "epoch": 0.7002331464854408, + "grad_norm": 28.297767529351393, + "learning_rate": 3.250000000000001e-07, + "logits/chosen": 4.41796875, + "logits/rejected": 3.88671875, + "logps/chosen": -1105.0, + "logps/rejected": -700.0, + "loss": 0.4197, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.990234375, + "rewards/margins": 7.890625, + "rewards/rejected": -4.89453125, + "step": 3529 + }, + { + "epoch": 0.7004315690262414, + "grad_norm": 33.54562355000618, + "learning_rate": 3.2473014336615245e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 3.80859375, + "logps/chosen": -855.0, + "logps/rejected": -637.0, + "loss": 0.4407, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.509765625, + "rewards/margins": 5.92578125, + "rewards/rejected": -3.40625, + "step": 3530 + }, + { 
+ "epoch": 0.700629991567042, + "grad_norm": 24.01541448454831, + "learning_rate": 3.2446039479025377e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.11328125, + "logps/chosen": -1282.0, + "logps/rejected": -853.0, + "loss": 0.3715, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.865234375, + "rewards/margins": 8.3046875, + "rewards/rejected": -5.4375, + "step": 3531 + }, + { + "epoch": 0.7008284141078427, + "grad_norm": 30.276441643449637, + "learning_rate": 3.241907544016971e-07, + "logits/chosen": 3.78515625, + "logits/rejected": 3.51953125, + "logps/chosen": -1082.0, + "logps/rejected": -764.0, + "loss": 0.3635, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.15625, + "rewards/margins": 7.4609375, + "rewards/rejected": -4.3125, + "step": 3532 + }, + { + "epoch": 0.7010268366486433, + "grad_norm": 27.251781166154057, + "learning_rate": 3.2392122232982467e-07, + "logits/chosen": 3.58203125, + "logits/rejected": 3.484375, + "logps/chosen": -1026.0, + "logps/rejected": -1230.5, + "loss": 0.4428, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.15234375, + "rewards/margins": 8.5859375, + "rewards/rejected": -6.416015625, + "step": 3533 + }, + { + "epoch": 0.7012252591894439, + "grad_norm": 33.04303058339974, + "learning_rate": 3.2365179870392586e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 3.89453125, + "logps/chosen": -584.5, + "logps/rejected": -687.0, + "loss": 0.5985, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.58203125, + "rewards/margins": 16.60546875, + "rewards/rejected": -15.01171875, + "step": 3534 + }, + { + "epoch": 0.7014236817302446, + "grad_norm": 36.294152048793805, + "learning_rate": 3.233824836532384e-07, + "logits/chosen": 4.390625, + "logits/rejected": 4.390625, + "logps/chosen": -801.0, + "logps/rejected": -742.5, + "loss": 0.5066, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5611572265625, + "rewards/margins": 7.15625, + "rewards/rejected": -5.5859375, + "step": 3535 + }, + { + 
"epoch": 0.7016221042710452, + "grad_norm": 28.53480082496825, + "learning_rate": 3.2311327730694845e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.09765625, + "logps/chosen": -872.0, + "logps/rejected": -592.0, + "loss": 0.3032, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6640625, + "rewards/margins": 8.546875, + "rewards/rejected": -5.89453125, + "step": 3536 + }, + { + "epoch": 0.7018205268118458, + "grad_norm": 36.165881700322295, + "learning_rate": 3.2284417979418875e-07, + "logits/chosen": 3.484375, + "logits/rejected": 3.296875, + "logps/chosen": -1217.0, + "logps/rejected": -905.0, + "loss": 0.391, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.771484375, + "rewards/margins": 7.6171875, + "rewards/rejected": -5.84375, + "step": 3537 + }, + { + "epoch": 0.7020189493526464, + "grad_norm": 26.883884935840396, + "learning_rate": 3.225751912440413e-07, + "logits/chosen": 3.32421875, + "logits/rejected": 3.03515625, + "logps/chosen": -1037.0, + "logps/rejected": -584.0, + "loss": 0.334, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.455078125, + "rewards/margins": 7.2734375, + "rewards/rejected": -4.8125, + "step": 3538 + }, + { + "epoch": 0.7022173718934471, + "grad_norm": 29.174957406465108, + "learning_rate": 3.223063117855345e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.4765625, + "logps/chosen": -1154.5, + "logps/rejected": -1178.5, + "loss": 0.4694, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9716796875, + "rewards/margins": 9.59375, + "rewards/rejected": -6.65234375, + "step": 3539 + }, + { + "epoch": 0.7024157944342477, + "grad_norm": 37.45831881536641, + "learning_rate": 3.2203754154764546e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.21875, + "logps/chosen": -900.0, + "logps/rejected": -688.0, + "loss": 0.4451, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.125, + "rewards/margins": 6.4453125, + "rewards/rejected": -4.3203125, + "step": 3540 + }, + { + "epoch": 
0.7026142169750483, + "grad_norm": 38.0566177878183, + "learning_rate": 3.2176888065929843e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.0, + "logps/chosen": -918.0, + "logps/rejected": -801.0, + "loss": 0.3519, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.89453125, + "rewards/margins": 7.9609375, + "rewards/rejected": -5.0546875, + "step": 3541 + }, + { + "epoch": 0.702812639515849, + "grad_norm": 39.24820632778024, + "learning_rate": 3.21500329249365e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 3.265625, + "logps/chosen": -1296.0, + "logps/rejected": -698.0, + "loss": 0.4937, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.169921875, + "rewards/margins": 7.0234375, + "rewards/rejected": -4.8671875, + "step": 3542 + }, + { + "epoch": 0.7030110620566496, + "grad_norm": 31.529934936949452, + "learning_rate": 3.2123188744666513e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 4.04296875, + "logps/chosen": -932.0, + "logps/rejected": -981.0, + "loss": 0.3649, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.490234375, + "rewards/margins": 9.3984375, + "rewards/rejected": -6.9140625, + "step": 3543 + }, + { + "epoch": 0.7032094845974503, + "grad_norm": 32.12864874492558, + "learning_rate": 3.2096355537996475e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 4.15625, + "logps/chosen": -831.0, + "logps/rejected": -808.0, + "loss": 0.4594, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.41796875, + "rewards/margins": 7.1484375, + "rewards/rejected": -4.7265625, + "step": 3544 + }, + { + "epoch": 0.7034079071382509, + "grad_norm": 39.84504226285521, + "learning_rate": 3.206953331779787e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.3515625, + "logps/chosen": -884.0, + "logps/rejected": -1193.5, + "loss": 0.3512, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4765625, + "rewards/margins": 12.265625, + "rewards/rejected": -10.796875, + "step": 3545 + }, + { + "epoch": 0.7036063296790516, + 
"grad_norm": 29.64178000578345, + "learning_rate": 3.20427220969368e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.05078125, + "logps/chosen": -903.0, + "logps/rejected": -1052.0, + "loss": 0.399, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0625, + "rewards/margins": 8.296875, + "rewards/rejected": -6.2265625, + "step": 3546 + }, + { + "epoch": 0.7038047522198522, + "grad_norm": 31.741703596223864, + "learning_rate": 3.201592188827416e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 3.8828125, + "logps/chosen": -963.0, + "logps/rejected": -861.0, + "loss": 0.4622, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.923828125, + "rewards/margins": 9.5, + "rewards/rejected": -6.58203125, + "step": 3547 + }, + { + "epoch": 0.7040031747606528, + "grad_norm": 31.279694375012568, + "learning_rate": 3.1989132704665504e-07, + "logits/chosen": 3.8046875, + "logits/rejected": 3.69140625, + "logps/chosen": -700.0, + "logps/rejected": -457.0, + "loss": 0.369, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.87109375, + "rewards/margins": 6.578125, + "rewards/rejected": -4.71484375, + "step": 3548 + }, + { + "epoch": 0.7042015973014535, + "grad_norm": 28.246564001955768, + "learning_rate": 3.1962354558961126e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.2265625, + "logps/chosen": -946.0, + "logps/rejected": -1065.0, + "loss": 0.407, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.74609375, + "rewards/margins": 10.0, + "rewards/rejected": -7.2734375, + "step": 3549 + }, + { + "epoch": 0.7044000198422541, + "grad_norm": 33.697180206980114, + "learning_rate": 3.1935587464006064e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.29296875, + "logps/chosen": -1174.0, + "logps/rejected": -2714.0, + "loss": 0.4198, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.142578125, + "rewards/margins": 15.546875, + "rewards/rejected": -12.390625, + "step": 3550 + }, + { + "epoch": 0.7045984423830547, + "grad_norm": 
27.210514885199103, + "learning_rate": 3.190883143263999e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.0703125, + "logps/chosen": -954.0, + "logps/rejected": -769.0, + "loss": 0.4422, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.416015625, + "rewards/margins": 8.578125, + "rewards/rejected": -6.1484375, + "step": 3551 + }, + { + "epoch": 0.7047968649238554, + "grad_norm": 37.19492786701278, + "learning_rate": 3.188208647769731e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 3.84765625, + "logps/chosen": -886.0, + "logps/rejected": -763.0, + "loss": 0.4098, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.810546875, + "rewards/margins": 7.02734375, + "rewards/rejected": -5.21875, + "step": 3552 + }, + { + "epoch": 0.704995287464656, + "grad_norm": 32.60428339445337, + "learning_rate": 3.1855352612007093e-07, + "logits/chosen": 3.890625, + "logits/rejected": 4.125, + "logps/chosen": -898.5, + "logps/rejected": -1343.0, + "loss": 0.4071, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.52734375, + "rewards/margins": 10.8984375, + "rewards/rejected": -7.37109375, + "step": 3553 + }, + { + "epoch": 0.7051937100054566, + "grad_norm": 30.67100741879519, + "learning_rate": 3.182862984839311e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 3.7734375, + "logps/chosen": -868.0, + "logps/rejected": -546.0, + "loss": 0.3953, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.42578125, + "rewards/margins": 7.5546875, + "rewards/rejected": -5.1171875, + "step": 3554 + }, + { + "epoch": 0.7053921325462572, + "grad_norm": 34.59542101148293, + "learning_rate": 3.180191819967376e-07, + "logits/chosen": 3.3828125, + "logits/rejected": 3.44921875, + "logps/chosen": -1003.0, + "logps/rejected": -683.5, + "loss": 0.3772, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7734375, + "rewards/margins": 8.4296875, + "rewards/rejected": -5.671875, + "step": 3555 + }, + { + "epoch": 0.7055905550870579, + "grad_norm": 29.1581864695783, + 
"learning_rate": 3.17752176786622e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.32421875, + "logps/chosen": -984.5, + "logps/rejected": -809.5, + "loss": 0.3632, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7919921875, + "rewards/margins": 8.9140625, + "rewards/rejected": -6.140625, + "step": 3556 + }, + { + "epoch": 0.7057889776278585, + "grad_norm": 31.406371424899934, + "learning_rate": 3.1748528298166164e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.23828125, + "logps/chosen": -875.0, + "logps/rejected": -816.0, + "loss": 0.2849, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.45703125, + "rewards/margins": 9.5, + "rewards/rejected": -6.046875, + "step": 3557 + }, + { + "epoch": 0.7059874001686591, + "grad_norm": 33.471046867641, + "learning_rate": 3.1721850070988094e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.40625, + "logps/chosen": -954.0, + "logps/rejected": -762.0, + "loss": 0.337, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.55859375, + "rewards/margins": 8.953125, + "rewards/rejected": -5.3984375, + "step": 3558 + }, + { + "epoch": 0.7061858227094598, + "grad_norm": 24.63483384119398, + "learning_rate": 3.169518300992504e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.31640625, + "logps/chosen": -918.0, + "logps/rejected": -653.0, + "loss": 0.3831, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.724609375, + "rewards/margins": 7.6953125, + "rewards/rejected": -4.96484375, + "step": 3559 + }, + { + "epoch": 0.7063842452502604, + "grad_norm": 31.09986612239981, + "learning_rate": 3.1668527127768727e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.85546875, + "logps/chosen": -1048.0, + "logps/rejected": -775.0, + "loss": 0.3982, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.36181640625, + "rewards/margins": 6.859375, + "rewards/rejected": -5.49609375, + "step": 3560 + }, + { + "epoch": 0.706582667791061, + "grad_norm": 34.030924763257495, + "learning_rate": 
3.164188243730551e-07, + "logits/chosen": 4.44140625, + "logits/rejected": 4.13671875, + "logps/chosen": -1092.0, + "logps/rejected": -770.0, + "loss": 0.4817, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.21484375, + "rewards/margins": 7.140625, + "rewards/rejected": -3.91015625, + "step": 3561 + }, + { + "epoch": 0.7067810903318617, + "grad_norm": 30.2240833615922, + "learning_rate": 3.1615248951316354e-07, + "logits/chosen": 4.296875, + "logits/rejected": 3.953125, + "logps/chosen": -1623.0, + "logps/rejected": -766.0, + "loss": 0.3389, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.662109375, + "rewards/margins": 9.4140625, + "rewards/rejected": -5.7578125, + "step": 3562 + }, + { + "epoch": 0.7069795128726624, + "grad_norm": 30.881102488223444, + "learning_rate": 3.1588626682576915e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 3.8828125, + "logps/chosen": -1124.0, + "logps/rejected": -1029.0, + "loss": 0.3013, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.337890625, + "rewards/margins": 10.8515625, + "rewards/rejected": -8.5, + "step": 3563 + }, + { + "epoch": 0.707177935413463, + "grad_norm": 29.760041152344012, + "learning_rate": 3.1562015643857365e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 3.671875, + "logps/chosen": -912.0, + "logps/rejected": -815.0, + "loss": 0.3882, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.80078125, + "rewards/margins": 11.6484375, + "rewards/rejected": -8.8486328125, + "step": 3564 + }, + { + "epoch": 0.7073763579542636, + "grad_norm": 46.392888723355824, + "learning_rate": 3.1535415847922585e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.01953125, + "logps/chosen": -955.0, + "logps/rejected": -779.0, + "loss": 0.4671, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.63037109375, + "rewards/margins": 8.10546875, + "rewards/rejected": -7.48828125, + "step": 3565 + }, + { + "epoch": 0.7075747804950643, + "grad_norm": 33.54545604255824, + "learning_rate": 
3.1508827307532027e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.20703125, + "logps/chosen": -1187.0, + "logps/rejected": -816.0, + "loss": 0.5105, + "rewards/accuracies": 0.71875, + "rewards/chosen": 3.021484375, + "rewards/margins": 10.203125, + "rewards/rejected": -7.15625, + "step": 3566 + }, + { + "epoch": 0.7077732030358649, + "grad_norm": 23.18934496640446, + "learning_rate": 3.148225003543971e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 4.1015625, + "logps/chosen": -866.5, + "logps/rejected": -1447.0, + "loss": 0.4679, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.486328125, + "rewards/margins": 10.9296875, + "rewards/rejected": -8.453125, + "step": 3567 + }, + { + "epoch": 0.7079716255766655, + "grad_norm": 35.364495620903085, + "learning_rate": 3.1455684044394326e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 3.8828125, + "logps/chosen": -993.5, + "logps/rejected": -1271.0, + "loss": 0.3586, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.943359375, + "rewards/margins": 9.5859375, + "rewards/rejected": -7.65625, + "step": 3568 + }, + { + "epoch": 0.7081700481174661, + "grad_norm": 48.6475571470462, + "learning_rate": 3.142912934713905e-07, + "logits/chosen": 4.359375, + "logits/rejected": 4.375, + "logps/chosen": -1140.0, + "logps/rejected": -1976.5, + "loss": 0.3928, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.35546875, + "rewards/margins": 12.8671875, + "rewards/rejected": -10.5078125, + "step": 3569 + }, + { + "epoch": 0.7083684706582668, + "grad_norm": 31.007718477663417, + "learning_rate": 3.1402585956411746e-07, + "logits/chosen": 3.6953125, + "logits/rejected": 4.2265625, + "logps/chosen": -845.0, + "logps/rejected": -1109.5, + "loss": 0.4113, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1640625, + "rewards/margins": 10.4765625, + "rewards/rejected": -8.31640625, + "step": 3570 + }, + { + "epoch": 0.7085668931990674, + "grad_norm": 31.072027218778572, + "learning_rate": 
3.1376053884944796e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 3.7734375, + "logps/chosen": -1062.5, + "logps/rejected": -830.0, + "loss": 0.2816, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.95703125, + "rewards/margins": 17.390625, + "rewards/rejected": -15.453125, + "step": 3571 + }, + { + "epoch": 0.708765315739868, + "grad_norm": 31.292328848873648, + "learning_rate": 3.134953314546514e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.859375, + "logps/chosen": -1183.0, + "logps/rejected": -719.0, + "loss": 0.2992, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.91796875, + "rewards/margins": 8.3671875, + "rewards/rejected": -5.453125, + "step": 3572 + }, + { + "epoch": 0.7089637382806687, + "grad_norm": 36.98122697522513, + "learning_rate": 3.1323023750694357e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 3.94921875, + "logps/chosen": -1182.0, + "logps/rejected": -859.0, + "loss": 0.4031, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.4375, + "rewards/margins": 8.1640625, + "rewards/rejected": -4.7578125, + "step": 3573 + }, + { + "epoch": 0.7091621608214693, + "grad_norm": 25.778411861771318, + "learning_rate": 3.1296525713348464e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.03515625, + "logps/chosen": -611.5, + "logps/rejected": -445.0, + "loss": 0.4145, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7841796875, + "rewards/margins": 6.6953125, + "rewards/rejected": -3.90625, + "step": 3574 + }, + { + "epoch": 0.7093605833622699, + "grad_norm": 37.144472329569965, + "learning_rate": 3.1270039046138157e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.81640625, + "logps/chosen": -1063.5, + "logps/rejected": -846.25, + "loss": 0.4342, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.853515625, + "rewards/margins": 8.81640625, + "rewards/rejected": -5.9765625, + "step": 3575 + }, + { + "epoch": 0.7095590059030706, + "grad_norm": 37.60050205475388, + "learning_rate": 
3.12435637617686e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.75390625, + "logps/chosen": -791.0, + "logps/rejected": -585.0, + "loss": 0.3261, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.771484375, + "rewards/margins": 7.8984375, + "rewards/rejected": -5.125, + "step": 3576 + }, + { + "epoch": 0.7097574284438712, + "grad_norm": 29.579382838869876, + "learning_rate": 3.1217099872939504e-07, + "logits/chosen": 3.43359375, + "logits/rejected": 3.34375, + "logps/chosen": -915.0, + "logps/rejected": -640.5, + "loss": 0.3726, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.634765625, + "rewards/margins": 7.421875, + "rewards/rejected": -4.78515625, + "step": 3577 + }, + { + "epoch": 0.7099558509846718, + "grad_norm": 29.279352760077252, + "learning_rate": 3.119064739234514e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.53125, + "logps/chosen": -926.0, + "logps/rejected": -884.5, + "loss": 0.395, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.236328125, + "rewards/margins": 8.890625, + "rewards/rejected": -6.6484375, + "step": 3578 + }, + { + "epoch": 0.7101542735254724, + "grad_norm": 32.74363352164116, + "learning_rate": 3.11642063326743e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 3.9765625, + "logps/chosen": -1226.0, + "logps/rejected": -795.0, + "loss": 0.3674, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.76953125, + "rewards/margins": 9.203125, + "rewards/rejected": -5.4296875, + "step": 3579 + }, + { + "epoch": 0.7103526960662732, + "grad_norm": 25.565279178943793, + "learning_rate": 3.1137776706610274e-07, + "logits/chosen": 4.4375, + "logits/rejected": 3.98046875, + "logps/chosen": -1079.0, + "logps/rejected": -836.0, + "loss": 0.2631, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.54296875, + "rewards/margins": 11.34375, + "rewards/rejected": -7.8046875, + "step": 3580 + }, + { + "epoch": 0.7105511186070738, + "grad_norm": 25.231638392720885, + "learning_rate": 3.1111358526830866e-07, + 
"logits/chosen": 3.91796875, + "logits/rejected": 3.94921875, + "logps/chosen": -1048.0, + "logps/rejected": -679.0, + "loss": 0.2417, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4765625, + "rewards/margins": 9.890625, + "rewards/rejected": -6.3984375, + "step": 3581 + }, + { + "epoch": 0.7107495411478744, + "grad_norm": 30.14606890779192, + "learning_rate": 3.1084951806008466e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.05859375, + "logps/chosen": -1108.0, + "logps/rejected": -761.0, + "loss": 0.4336, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8388671875, + "rewards/margins": 7.5546875, + "rewards/rejected": -5.7109375, + "step": 3582 + }, + { + "epoch": 0.7109479636886751, + "grad_norm": 32.00221588376034, + "learning_rate": 3.105855655680986e-07, + "logits/chosen": 3.8515625, + "logits/rejected": 4.015625, + "logps/chosen": -938.5, + "logps/rejected": -843.5, + "loss": 0.4963, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.3271484375, + "rewards/margins": 8.359375, + "rewards/rejected": -6.04296875, + "step": 3583 + }, + { + "epoch": 0.7111463862294757, + "grad_norm": 23.131122199585334, + "learning_rate": 3.1032172791896394e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 3.8984375, + "logps/chosen": -1299.0, + "logps/rejected": -1271.5, + "loss": 0.3904, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5009765625, + "rewards/margins": 10.2265625, + "rewards/rejected": -7.72265625, + "step": 3584 + }, + { + "epoch": 0.7113448087702763, + "grad_norm": 33.93887290618266, + "learning_rate": 3.1005800523923903e-07, + "logits/chosen": 4.05078125, + "logits/rejected": 3.86328125, + "logps/chosen": -941.0, + "logps/rejected": -729.5, + "loss": 0.3154, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.48828125, + "rewards/margins": 9.0, + "rewards/rejected": -6.53125, + "step": 3585 + }, + { + "epoch": 0.7115432313110769, + "grad_norm": 25.24349536110481, + "learning_rate": 3.0979439765542683e-07, + 
"logits/chosen": 3.5625, + "logits/rejected": 3.75, + "logps/chosen": -1116.0, + "logps/rejected": -2001.0, + "loss": 0.4164, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5546875, + "rewards/margins": 12.234375, + "rewards/rejected": -9.6796875, + "step": 3586 + }, + { + "epoch": 0.7117416538518776, + "grad_norm": 26.871604514065023, + "learning_rate": 3.0953090529397495e-07, + "logits/chosen": 4.48828125, + "logits/rejected": 4.46484375, + "logps/chosen": -968.0, + "logps/rejected": -635.0, + "loss": 0.3452, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.822265625, + "rewards/margins": 8.421875, + "rewards/rejected": -5.609375, + "step": 3587 + }, + { + "epoch": 0.7119400763926782, + "grad_norm": 30.8779802456441, + "learning_rate": 3.092675282812766e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 3.86328125, + "logps/chosen": -893.0, + "logps/rejected": -764.0, + "loss": 0.3913, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.26953125, + "rewards/margins": 9.6796875, + "rewards/rejected": -7.40625, + "step": 3588 + }, + { + "epoch": 0.7121384989334788, + "grad_norm": 26.354476240502436, + "learning_rate": 3.0900426674366877e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.3359375, + "logps/chosen": -1139.0, + "logps/rejected": -828.0, + "loss": 0.3123, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.59765625, + "rewards/margins": 8.65625, + "rewards/rejected": -6.0546875, + "step": 3589 + }, + { + "epoch": 0.7123369214742795, + "grad_norm": 30.831763251543947, + "learning_rate": 3.0874112080743313e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 3.71484375, + "logps/chosen": -968.0, + "logps/rejected": -960.0, + "loss": 0.4967, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.9375, + "rewards/margins": 9.9921875, + "rewards/rejected": -8.046875, + "step": 3590 + }, + { + "epoch": 0.7125353440150801, + "grad_norm": 41.075640768239595, + "learning_rate": 3.0847809059879636e-07, + "logits/chosen": 3.82421875, 
+ "logits/rejected": 3.7109375, + "logps/chosen": -1134.0, + "logps/rejected": -827.0, + "loss": 0.4888, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.2265625, + "rewards/margins": 6.40625, + "rewards/rejected": -4.18359375, + "step": 3591 + }, + { + "epoch": 0.7127337665558807, + "grad_norm": 33.68011568006158, + "learning_rate": 3.0821517624392925e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.12109375, + "logps/chosen": -725.5, + "logps/rejected": -1089.5, + "loss": 0.5347, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.291015625, + "rewards/margins": 6.640625, + "rewards/rejected": -4.361328125, + "step": 3592 + }, + { + "epoch": 0.7129321890966814, + "grad_norm": 20.814695485390494, + "learning_rate": 3.079523778689474e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.60546875, + "logps/chosen": -1144.0, + "logps/rejected": -764.0, + "loss": 0.3181, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.56640625, + "rewards/margins": 10.2578125, + "rewards/rejected": -6.69921875, + "step": 3593 + }, + { + "epoch": 0.713130611637482, + "grad_norm": 27.315561362614876, + "learning_rate": 3.076896955999101e-07, + "logits/chosen": 3.828125, + "logits/rejected": 3.66015625, + "logps/chosen": -1150.0, + "logps/rejected": -713.5, + "loss": 0.3753, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.11328125, + "rewards/margins": 8.890625, + "rewards/rejected": -5.7734375, + "step": 3594 + }, + { + "epoch": 0.7133290341782826, + "grad_norm": 34.670998773428174, + "learning_rate": 3.074271295628217e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.15234375, + "logps/chosen": -993.0, + "logps/rejected": -801.0, + "loss": 0.4564, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.916259765625, + "rewards/margins": 7.4296875, + "rewards/rejected": -5.5078125, + "step": 3595 + }, + { + "epoch": 0.7135274567190832, + "grad_norm": 30.557040434417534, + "learning_rate": 3.0716467988363037e-07, + "logits/chosen": 4.1328125, + 
"logits/rejected": 3.76171875, + "logps/chosen": -1606.0, + "logps/rejected": -694.0, + "loss": 0.3871, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.103515625, + "rewards/margins": 5.84765625, + "rewards/rejected": -5.953125, + "step": 3596 + }, + { + "epoch": 0.713725879259884, + "grad_norm": 32.340784484293266, + "learning_rate": 3.0690234668822837e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 4.09375, + "logps/chosen": -948.0, + "logps/rejected": -662.5, + "loss": 0.3551, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.71875, + "rewards/margins": 7.5546875, + "rewards/rejected": -4.84765625, + "step": 3597 + }, + { + "epoch": 0.7139243018006846, + "grad_norm": 36.290523249500765, + "learning_rate": 3.066401301024528e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.80859375, + "logps/chosen": -1163.0, + "logps/rejected": -837.0, + "loss": 0.4042, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.89453125, + "rewards/margins": 8.9453125, + "rewards/rejected": -6.03125, + "step": 3598 + }, + { + "epoch": 0.7141227243414852, + "grad_norm": 41.292184570769955, + "learning_rate": 3.0637803025208364e-07, + "logits/chosen": 3.8515625, + "logits/rejected": 3.8359375, + "logps/chosen": -989.0, + "logps/rejected": -705.0, + "loss": 0.5189, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.92578125, + "rewards/margins": 5.6796875, + "rewards/rejected": -3.74609375, + "step": 3599 + }, + { + "epoch": 0.7143211468822859, + "grad_norm": 24.32177867982252, + "learning_rate": 3.06116047262846e-07, + "logits/chosen": 4.6328125, + "logits/rejected": 4.58984375, + "logps/chosen": -694.0, + "logps/rejected": -706.0, + "loss": 0.4407, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.7537841796875, + "rewards/margins": 7.6328125, + "rewards/rejected": -5.8671875, + "step": 3600 + }, + { + "epoch": 0.7145195694230865, + "grad_norm": 33.40467582454036, + "learning_rate": 3.058541812604083e-07, + "logits/chosen": 3.984375, + 
"logits/rejected": 3.98046875, + "logps/chosen": -1123.0, + "logps/rejected": -848.0, + "loss": 0.3709, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.9169921875, + "rewards/margins": 9.390625, + "rewards/rejected": -7.484375, + "step": 3601 + }, + { + "epoch": 0.7147179919638871, + "grad_norm": 22.915681189932748, + "learning_rate": 3.05592432370383e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.30078125, + "logps/chosen": -903.0, + "logps/rejected": -682.0, + "loss": 0.3924, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.544921875, + "rewards/margins": 7.171875, + "rewards/rejected": -4.63671875, + "step": 3602 + }, + { + "epoch": 0.7149164145046877, + "grad_norm": 23.717886181691924, + "learning_rate": 3.053308007183264e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 3.80078125, + "logps/chosen": -717.0, + "logps/rejected": -525.0, + "loss": 0.3545, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1787109375, + "rewards/margins": 7.0390625, + "rewards/rejected": -4.86328125, + "step": 3603 + }, + { + "epoch": 0.7151148370454884, + "grad_norm": 29.875254451332392, + "learning_rate": 3.0506928642973843e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 3.94921875, + "logps/chosen": -865.0, + "logps/rejected": -687.5, + "loss": 0.4161, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.751953125, + "rewards/margins": 18.9140625, + "rewards/rejected": -16.1796875, + "step": 3604 + }, + { + "epoch": 0.715313259586289, + "grad_norm": 32.05068606215375, + "learning_rate": 3.048078896300634e-07, + "logits/chosen": 4.32421875, + "logits/rejected": 4.29296875, + "logps/chosen": -956.0, + "logps/rejected": -768.0, + "loss": 0.4148, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.1171875, + "rewards/margins": 7.4921875, + "rewards/rejected": -5.3671875, + "step": 3605 + }, + { + "epoch": 0.7155116821270896, + "grad_norm": 29.221561637983992, + "learning_rate": 3.045466104446879e-07, + "logits/chosen": 3.6796875, + 
"logits/rejected": 3.86328125, + "logps/chosen": -1234.0, + "logps/rejected": -1280.5, + "loss": 0.3723, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.83203125, + "rewards/margins": 11.515625, + "rewards/rejected": -8.6875, + "step": 3606 + }, + { + "epoch": 0.7157101046678903, + "grad_norm": 29.65619444259686, + "learning_rate": 3.042854489989435e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 3.953125, + "logps/chosen": -1355.0, + "logps/rejected": -733.0, + "loss": 0.4103, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.369140625, + "rewards/margins": 8.234375, + "rewards/rejected": -5.859375, + "step": 3607 + }, + { + "epoch": 0.7159085272086909, + "grad_norm": 23.793414378254795, + "learning_rate": 3.040244054181045e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.1328125, + "logps/chosen": -1044.0, + "logps/rejected": -713.5, + "loss": 0.416, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7353515625, + "rewards/margins": 8.0078125, + "rewards/rejected": -5.265625, + "step": 3608 + }, + { + "epoch": 0.7161069497494915, + "grad_norm": 26.86716155905887, + "learning_rate": 3.037634798273888e-07, + "logits/chosen": 3.96875, + "logits/rejected": 3.7421875, + "logps/chosen": -996.0, + "logps/rejected": -758.0, + "loss": 0.3536, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.900390625, + "rewards/margins": 7.84375, + "rewards/rejected": -5.9609375, + "step": 3609 + }, + { + "epoch": 0.7163053722902922, + "grad_norm": 28.333051205006146, + "learning_rate": 3.035026723519579e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 3.8828125, + "logps/chosen": -1126.5, + "logps/rejected": -623.5, + "loss": 0.5391, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.978515625, + "rewards/margins": 6.09375, + "rewards/rejected": -4.11328125, + "step": 3610 + }, + { + "epoch": 0.7165037948310928, + "grad_norm": 28.221530143573492, + "learning_rate": 3.0324198311691665e-07, + "logits/chosen": 3.8359375, + "logits/rejected": 
3.83203125, + "logps/chosen": -1121.0, + "logps/rejected": -791.0, + "loss": 0.3856, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.2421875, + "rewards/margins": 8.8359375, + "rewards/rejected": -5.578125, + "step": 3611 + }, + { + "epoch": 0.7167022173718934, + "grad_norm": 28.74012026418325, + "learning_rate": 3.0298141224731257e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 3.7265625, + "logps/chosen": -746.0, + "logps/rejected": -599.5, + "loss": 0.4718, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6962890625, + "rewards/margins": 6.5859375, + "rewards/rejected": -3.8984375, + "step": 3612 + }, + { + "epoch": 0.716900639912694, + "grad_norm": 35.182323396742305, + "learning_rate": 3.027209598681373e-07, + "logits/chosen": 4.21875, + "logits/rejected": 3.81640625, + "logps/chosen": -1261.0, + "logps/rejected": -787.0, + "loss": 0.4657, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.626953125, + "rewards/margins": 7.12109375, + "rewards/rejected": -4.4912109375, + "step": 3613 + }, + { + "epoch": 0.7170990624534948, + "grad_norm": 45.46032938785077, + "learning_rate": 3.024606261043251e-07, + "logits/chosen": 3.47265625, + "logits/rejected": 3.7578125, + "logps/chosen": -743.0, + "logps/rejected": -635.0, + "loss": 0.6755, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.5107421875, + "rewards/margins": 5.10546875, + "rewards/rejected": -3.595703125, + "step": 3614 + }, + { + "epoch": 0.7172974849942954, + "grad_norm": 27.52167640957033, + "learning_rate": 3.0220041108075353e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.7109375, + "logps/chosen": -750.5, + "logps/rejected": -829.0, + "loss": 0.534, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.458984375, + "rewards/margins": 6.5390625, + "rewards/rejected": -5.078125, + "step": 3615 + }, + { + "epoch": 0.717495907535096, + "grad_norm": 41.59503677084152, + "learning_rate": 3.01940314922243e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 
3.9921875, + "logps/chosen": -1202.0, + "logps/rejected": -767.0, + "loss": 0.498, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.96142578125, + "rewards/margins": 7.71875, + "rewards/rejected": -5.75, + "step": 3616 + }, + { + "epoch": 0.7176943300758967, + "grad_norm": 29.69946669502992, + "learning_rate": 3.016803377535568e-07, + "logits/chosen": 3.609375, + "logits/rejected": 3.55859375, + "logps/chosen": -917.0, + "logps/rejected": -706.0, + "loss": 0.4295, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.560546875, + "rewards/margins": 7.28125, + "rewards/rejected": -4.71875, + "step": 3617 + }, + { + "epoch": 0.7178927526166973, + "grad_norm": 32.61735040714865, + "learning_rate": 3.0142047969940204e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.3828125, + "logps/chosen": -944.0, + "logps/rejected": -845.0, + "loss": 0.4124, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.01171875, + "rewards/margins": 7.4296875, + "rewards/rejected": -4.42578125, + "step": 3618 + }, + { + "epoch": 0.7180911751574979, + "grad_norm": 36.084020871581146, + "learning_rate": 3.0116074088442724e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.1875, + "logps/chosen": -789.0, + "logps/rejected": -775.0, + "loss": 0.4568, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.5634765625, + "rewards/margins": 6.71875, + "rewards/rejected": -5.15625, + "step": 3619 + }, + { + "epoch": 0.7182895976982985, + "grad_norm": 30.99225409223812, + "learning_rate": 3.00901121433225e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.2109375, + "logps/chosen": -1045.0, + "logps/rejected": -1142.0, + "loss": 0.4503, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.82421875, + "rewards/margins": 13.1796875, + "rewards/rejected": -10.376953125, + "step": 3620 + }, + { + "epoch": 0.7184880202390992, + "grad_norm": 27.04555222959575, + "learning_rate": 3.0064162147033e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.234375, + "logps/chosen": 
-1070.0, + "logps/rejected": -854.0, + "loss": 0.2781, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3828125, + "rewards/margins": 9.6875, + "rewards/rejected": -6.2890625, + "step": 3621 + }, + { + "epoch": 0.7186864427798998, + "grad_norm": 28.070430330154544, + "learning_rate": 3.003822411202195e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 3.70703125, + "logps/chosen": -1176.0, + "logps/rejected": -786.5, + "loss": 0.4329, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.03125, + "rewards/margins": 7.4453125, + "rewards/rejected": -4.421875, + "step": 3622 + }, + { + "epoch": 0.7188848653207004, + "grad_norm": 29.822444463781267, + "learning_rate": 3.0012298050731437e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.18359375, + "logps/chosen": -917.0, + "logps/rejected": -1363.0, + "loss": 0.321, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.81640625, + "rewards/margins": 11.3671875, + "rewards/rejected": -8.5625, + "step": 3623 + }, + { + "epoch": 0.7190832878615011, + "grad_norm": 31.04660388297879, + "learning_rate": 2.9986383975597647e-07, + "logits/chosen": 3.6953125, + "logits/rejected": 3.84375, + "logps/chosen": -911.5, + "logps/rejected": -718.0, + "loss": 0.4552, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.2763671875, + "rewards/margins": 6.03515625, + "rewards/rejected": -3.74609375, + "step": 3624 + }, + { + "epoch": 0.7192817104023017, + "grad_norm": 28.444367066637003, + "learning_rate": 2.996048189905117e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 4.3359375, + "logps/chosen": -846.0, + "logps/rejected": -772.0, + "loss": 0.4087, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.25, + "rewards/margins": 8.3125, + "rewards/rejected": -6.05859375, + "step": 3625 + }, + { + "epoch": 0.7194801329431023, + "grad_norm": 33.04515496631556, + "learning_rate": 2.9934591833516713e-07, + "logits/chosen": 3.6875, + "logits/rejected": 3.6875, + "logps/chosen": -1516.0, + "logps/rejected": 
-1029.0, + "loss": 0.3537, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4453125, + "rewards/margins": 9.03125, + "rewards/rejected": -5.5859375, + "step": 3626 + }, + { + "epoch": 0.719678555483903, + "grad_norm": 35.33779635499157, + "learning_rate": 2.9908713791413326e-07, + "logits/chosen": 3.66015625, + "logits/rejected": 3.8203125, + "logps/chosen": -961.0, + "logps/rejected": -824.0, + "loss": 0.3781, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.123046875, + "rewards/margins": 6.9609375, + "rewards/rejected": -4.8359375, + "step": 3627 + }, + { + "epoch": 0.7198769780247036, + "grad_norm": 36.410615979112514, + "learning_rate": 2.988284778515423e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.7265625, + "logps/chosen": -847.5, + "logps/rejected": -652.0, + "loss": 0.4513, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.814453125, + "rewards/margins": 6.421875, + "rewards/rejected": -4.61328125, + "step": 3628 + }, + { + "epoch": 0.7200754005655042, + "grad_norm": 30.301271767273487, + "learning_rate": 2.985699382714687e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.17578125, + "logps/chosen": -761.0, + "logps/rejected": -581.5, + "loss": 0.4241, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.171875, + "rewards/margins": 6.6328125, + "rewards/rejected": -4.4609375, + "step": 3629 + }, + { + "epoch": 0.7202738231063048, + "grad_norm": 29.045078057697072, + "learning_rate": 2.9831151929792984e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 3.94140625, + "logps/chosen": -1238.0, + "logps/rejected": -765.5, + "loss": 0.4155, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.15234375, + "rewards/margins": 7.6171875, + "rewards/rejected": -4.4609375, + "step": 3630 + }, + { + "epoch": 0.7204722456471055, + "grad_norm": 32.73568107680317, + "learning_rate": 2.98053221054884e-07, + "logits/chosen": 4.046875, + "logits/rejected": 4.109375, + "logps/chosen": -896.0, + "logps/rejected": -637.5, + "loss": 
0.2787, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.34765625, + "rewards/margins": 8.8671875, + "rewards/rejected": -5.5078125, + "step": 3631 + }, + { + "epoch": 0.7206706681879061, + "grad_norm": 30.102108826266445, + "learning_rate": 2.9779504366623286e-07, + "logits/chosen": 3.64453125, + "logits/rejected": 3.16796875, + "logps/chosen": -740.0, + "logps/rejected": -496.0, + "loss": 0.4061, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.369140625, + "rewards/margins": 7.05859375, + "rewards/rejected": -4.6875, + "step": 3632 + }, + { + "epoch": 0.7208690907287068, + "grad_norm": 36.12277794314827, + "learning_rate": 2.9753698725581923e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 3.98046875, + "logps/chosen": -1140.0, + "logps/rejected": -769.0, + "loss": 0.4418, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.796875, + "rewards/margins": 13.75, + "rewards/rejected": -9.96875, + "step": 3633 + }, + { + "epoch": 0.7210675132695075, + "grad_norm": 35.34602487109212, + "learning_rate": 2.9727905194742824e-07, + "logits/chosen": 3.87109375, + "logits/rejected": 3.87890625, + "logps/chosen": -1063.0, + "logps/rejected": -677.0, + "loss": 0.425, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.068359375, + "rewards/margins": 8.25, + "rewards/rejected": -5.203125, + "step": 3634 + }, + { + "epoch": 0.7212659358103081, + "grad_norm": 46.14942760111825, + "learning_rate": 2.9702123786478695e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 3.81640625, + "logps/chosen": -1015.0, + "logps/rejected": -750.5, + "loss": 0.4045, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.001953125, + "rewards/margins": 6.5390625, + "rewards/rejected": -4.546875, + "step": 3635 + }, + { + "epoch": 0.7214643583511087, + "grad_norm": 33.69350775479014, + "learning_rate": 2.9676354513156387e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 4.02734375, + "logps/chosen": -1073.0, + "logps/rejected": -795.0, + "loss": 0.4472, + 
"rewards/accuracies": 0.78125, + "rewards/chosen": 2.0595703125, + "rewards/margins": 7.5546875, + "rewards/rejected": -5.5078125, + "step": 3636 + }, + { + "epoch": 0.7216627808919093, + "grad_norm": 26.54814287980162, + "learning_rate": 2.965059738713701e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 3.8515625, + "logps/chosen": -1308.0, + "logps/rejected": -1852.0, + "loss": 0.437, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5087890625, + "rewards/margins": 11.1875, + "rewards/rejected": -9.6953125, + "step": 3637 + }, + { + "epoch": 0.72186120343271, + "grad_norm": 37.50818649297479, + "learning_rate": 2.962485242077577e-07, + "logits/chosen": 4.05078125, + "logits/rejected": 3.84765625, + "logps/chosen": -1032.0, + "logps/rejected": -943.0, + "loss": 0.4155, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.10546875, + "rewards/margins": 8.10546875, + "rewards/rejected": -5.9921875, + "step": 3638 + }, + { + "epoch": 0.7220596259735106, + "grad_norm": 33.993268006677155, + "learning_rate": 2.9599119626422085e-07, + "logits/chosen": 3.9375, + "logits/rejected": 4.12890625, + "logps/chosen": -950.0, + "logps/rejected": -1423.0, + "loss": 0.3966, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2080078125, + "rewards/margins": 9.625, + "rewards/rejected": -7.4140625, + "step": 3639 + }, + { + "epoch": 0.7222580485143112, + "grad_norm": 25.999273850719256, + "learning_rate": 2.9573399016419515e-07, + "logits/chosen": 3.50390625, + "logits/rejected": 3.546875, + "logps/chosen": -901.0, + "logps/rejected": -817.0, + "loss": 0.4156, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.365234375, + "rewards/margins": 6.9296875, + "rewards/rejected": -4.56640625, + "step": 3640 + }, + { + "epoch": 0.7224564710551119, + "grad_norm": 25.455107145935735, + "learning_rate": 2.954769060310577e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 3.8828125, + "logps/chosen": -1153.0, + "logps/rejected": -792.0, + "loss": 0.2985, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 3.4765625, + "rewards/margins": 10.3359375, + "rewards/rejected": -6.875, + "step": 3641 + }, + { + "epoch": 0.7226548935959125, + "grad_norm": 29.40666371190966, + "learning_rate": 2.9521994398812717e-07, + "logits/chosen": 3.19140625, + "logits/rejected": 3.21875, + "logps/chosen": -718.0, + "logps/rejected": -601.75, + "loss": 0.4933, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.9892578125, + "rewards/margins": 6.87109375, + "rewards/rejected": -4.890625, + "step": 3642 + }, + { + "epoch": 0.7228533161367131, + "grad_norm": 39.910520936672384, + "learning_rate": 2.9496310415866407e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 4.1171875, + "logps/chosen": -1061.0, + "logps/rejected": -1493.0, + "loss": 0.2614, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.82421875, + "rewards/margins": 11.734375, + "rewards/rejected": -8.90625, + "step": 3643 + }, + { + "epoch": 0.7230517386775137, + "grad_norm": 37.53308396760362, + "learning_rate": 2.947063866658692e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.64453125, + "logps/chosen": -880.5, + "logps/rejected": -540.5, + "loss": 0.558, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8203125, + "rewards/margins": 6.34375, + "rewards/rejected": -4.515625, + "step": 3644 + }, + { + "epoch": 0.7232501612183144, + "grad_norm": 34.675111858252336, + "learning_rate": 2.9444979163288605e-07, + "logits/chosen": 4.37890625, + "logits/rejected": 4.41015625, + "logps/chosen": -951.0, + "logps/rejected": -803.0, + "loss": 0.4347, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7421875, + "rewards/margins": 11.8671875, + "rewards/rejected": -9.1015625, + "step": 3645 + }, + { + "epoch": 0.723448583759115, + "grad_norm": 34.567896894961066, + "learning_rate": 2.941933191827985e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.38671875, + "logps/chosen": -1125.0, + "logps/rejected": -695.5, + "loss": 0.3438, + "rewards/accuracies": 
0.875, + "rewards/chosen": 2.490234375, + "rewards/margins": 8.8515625, + "rewards/rejected": -6.375, + "step": 3646 + }, + { + "epoch": 0.7236470062999156, + "grad_norm": 29.20438200811106, + "learning_rate": 2.939369694386317e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.03125, + "logps/chosen": -1012.5, + "logps/rejected": -738.0, + "loss": 0.355, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.08203125, + "rewards/margins": 7.6953125, + "rewards/rejected": -4.62890625, + "step": 3647 + }, + { + "epoch": 0.7238454288407163, + "grad_norm": 34.446622097214835, + "learning_rate": 2.936807425233522e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.94921875, + "logps/chosen": -917.0, + "logps/rejected": -629.0, + "loss": 0.4804, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3125, + "rewards/margins": 6.671875, + "rewards/rejected": -4.3515625, + "step": 3648 + }, + { + "epoch": 0.724043851381517, + "grad_norm": 32.98624016913571, + "learning_rate": 2.934246385598672e-07, + "logits/chosen": 3.61328125, + "logits/rejected": 3.55859375, + "logps/chosen": -983.0, + "logps/rejected": -613.0, + "loss": 0.3757, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.158203125, + "rewards/margins": 8.828125, + "rewards/rejected": -5.671875, + "step": 3649 + }, + { + "epoch": 0.7242422739223175, + "grad_norm": 28.847049274078508, + "learning_rate": 2.931686576710258e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.25390625, + "logps/chosen": -884.5, + "logps/rejected": -739.0, + "loss": 0.2462, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.88671875, + "rewards/margins": 8.953125, + "rewards/rejected": -6.046875, + "step": 3650 + }, + { + "epoch": 0.7244406964631183, + "grad_norm": 27.376237920020746, + "learning_rate": 2.9291279997961693e-07, + "logits/chosen": 3.39453125, + "logits/rejected": 3.67578125, + "logps/chosen": -1103.0, + "logps/rejected": -1647.0, + "loss": 0.3601, + "rewards/accuracies": 0.875, + 
"rewards/chosen": 2.76171875, + "rewards/margins": 11.265625, + "rewards/rejected": -8.5078125, + "step": 3651 + }, + { + "epoch": 0.7246391190039189, + "grad_norm": 30.17613486570595, + "learning_rate": 2.926570656083712e-07, + "logits/chosen": 3.984375, + "logits/rejected": 3.9453125, + "logps/chosen": -1018.0, + "logps/rejected": -1032.5, + "loss": 0.3885, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1171875, + "rewards/margins": 8.0546875, + "rewards/rejected": -5.9609375, + "step": 3652 + }, + { + "epoch": 0.7248375415447195, + "grad_norm": 28.358005137020033, + "learning_rate": 2.9240145467996e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.65234375, + "logps/chosen": -888.0, + "logps/rejected": -632.5, + "loss": 0.3642, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.98046875, + "rewards/margins": 7.828125, + "rewards/rejected": -4.859375, + "step": 3653 + }, + { + "epoch": 0.7250359640855201, + "grad_norm": 42.244633841207786, + "learning_rate": 2.9214596731699495e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.16015625, + "logps/chosen": -920.0, + "logps/rejected": -896.0, + "loss": 0.4208, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.146484375, + "rewards/margins": 8.1953125, + "rewards/rejected": -6.0234375, + "step": 3654 + }, + { + "epoch": 0.7252343866263208, + "grad_norm": 29.952472435374364, + "learning_rate": 2.918906036420294e-07, + "logits/chosen": 3.63671875, + "logits/rejected": 3.71875, + "logps/chosen": -1190.0, + "logps/rejected": -693.5, + "loss": 0.4392, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.208984375, + "rewards/margins": 7.4140625, + "rewards/rejected": -5.20703125, + "step": 3655 + }, + { + "epoch": 0.7254328091671214, + "grad_norm": 25.341236409391474, + "learning_rate": 2.91635363777556e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 3.93359375, + "logps/chosen": -1064.0, + "logps/rejected": -848.0, + "loss": 0.4782, + "rewards/accuracies": 0.78125, + 
"rewards/chosen": 1.927734375, + "rewards/margins": 8.125, + "rewards/rejected": -6.19140625, + "step": 3656 + }, + { + "epoch": 0.725631231707922, + "grad_norm": 28.747439187338053, + "learning_rate": 2.9138024784600947e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.0625, + "logps/chosen": -956.0, + "logps/rejected": -1133.0, + "loss": 0.4384, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.091796875, + "rewards/margins": 8.109375, + "rewards/rejected": -6.015625, + "step": 3657 + }, + { + "epoch": 0.7258296542487227, + "grad_norm": 25.463109615948035, + "learning_rate": 2.9112525596976414e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.46484375, + "logps/chosen": -942.5, + "logps/rejected": -1080.0, + "loss": 0.3574, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0751953125, + "rewards/margins": 11.0, + "rewards/rejected": -8.921875, + "step": 3658 + }, + { + "epoch": 0.7260280767895233, + "grad_norm": 28.973239838834672, + "learning_rate": 2.908703882711349e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 3.609375, + "logps/chosen": -844.0, + "logps/rejected": -647.5, + "loss": 0.42, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.56640625, + "rewards/margins": 7.8671875, + "rewards/rejected": -5.296875, + "step": 3659 + }, + { + "epoch": 0.7262264993303239, + "grad_norm": 25.074572525837105, + "learning_rate": 2.9061564487237757e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 3.87109375, + "logps/chosen": -1109.0, + "logps/rejected": -679.0, + "loss": 0.2824, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.89453125, + "rewards/margins": 8.890625, + "rewards/rejected": -6.0078125, + "step": 3660 + }, + { + "epoch": 0.7264249218711245, + "grad_norm": 35.311559212869135, + "learning_rate": 2.9036102589568766e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 4.13671875, + "logps/chosen": -735.5, + "logps/rejected": -613.0, + "loss": 0.4307, + "rewards/accuracies": 0.90625, + "rewards/chosen": 
2.2099609375, + "rewards/margins": 6.734375, + "rewards/rejected": -4.50390625, + "step": 3661 + }, + { + "epoch": 0.7266233444119252, + "grad_norm": 33.34258442176599, + "learning_rate": 2.901065314632017e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.07421875, + "logps/chosen": -1348.0, + "logps/rejected": -718.5, + "loss": 0.4068, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.08984375, + "rewards/margins": 9.1484375, + "rewards/rejected": -6.0390625, + "step": 3662 + }, + { + "epoch": 0.7268217669527258, + "grad_norm": 32.517244654064065, + "learning_rate": 2.898521616969961e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.515625, + "logps/chosen": -894.0, + "logps/rejected": -679.0, + "loss": 0.431, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.21875, + "rewards/margins": 6.8984375, + "rewards/rejected": -4.69921875, + "step": 3663 + }, + { + "epoch": 0.7270201894935264, + "grad_norm": 39.75650023390264, + "learning_rate": 2.895979167190874e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 3.703125, + "logps/chosen": -1178.0, + "logps/rejected": -950.0, + "loss": 0.3504, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.419921875, + "rewards/margins": 7.7734375, + "rewards/rejected": -5.34375, + "step": 3664 + }, + { + "epoch": 0.7272186120343271, + "grad_norm": 30.026043018537692, + "learning_rate": 2.8934379665143253e-07, + "logits/chosen": 3.73046875, + "logits/rejected": 3.8203125, + "logps/chosen": -1228.0, + "logps/rejected": -951.0, + "loss": 0.3662, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.521484375, + "rewards/margins": 10.015625, + "rewards/rejected": -7.484375, + "step": 3665 + }, + { + "epoch": 0.7274170345751277, + "grad_norm": 30.180681259400153, + "learning_rate": 2.890898016159283e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.265625, + "logps/chosen": -1057.0, + "logps/rejected": -1053.0, + "loss": 0.258, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.95703125, + 
"rewards/margins": 10.1875, + "rewards/rejected": -7.234375, + "step": 3666 + }, + { + "epoch": 0.7276154571159283, + "grad_norm": 27.38828013060287, + "learning_rate": 2.888359317344114e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 3.94921875, + "logps/chosen": -768.0, + "logps/rejected": -507.0, + "loss": 0.347, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.99609375, + "rewards/margins": 7.546875, + "rewards/rejected": -4.5546875, + "step": 3667 + }, + { + "epoch": 0.7278138796567291, + "grad_norm": 39.3298891603469, + "learning_rate": 2.885821871286591e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.2109375, + "logps/chosen": -970.0, + "logps/rejected": -583.0, + "loss": 0.5859, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.11083984375, + "rewards/margins": 5.22265625, + "rewards/rejected": -4.115234375, + "step": 3668 + }, + { + "epoch": 0.7280123021975297, + "grad_norm": 22.54679056611206, + "learning_rate": 2.8832856792038794e-07, + "logits/chosen": 4.43359375, + "logits/rejected": 4.08203125, + "logps/chosen": -1082.0, + "logps/rejected": -629.5, + "loss": 0.4181, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.720703125, + "rewards/margins": 8.03515625, + "rewards/rejected": -5.296875, + "step": 3669 + }, + { + "epoch": 0.7282107247383303, + "grad_norm": 35.82375711081394, + "learning_rate": 2.8807507423125476e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.78515625, + "logps/chosen": -896.0, + "logps/rejected": -812.5, + "loss": 0.4436, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6796875, + "rewards/margins": 6.578125, + "rewards/rejected": -3.90234375, + "step": 3670 + }, + { + "epoch": 0.7284091472791309, + "grad_norm": 27.582644829217802, + "learning_rate": 2.8782170618285566e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.20703125, + "logps/chosen": -1294.0, + "logps/rejected": -852.5, + "loss": 0.3637, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.771484375, + 
"rewards/margins": 9.3359375, + "rewards/rejected": -6.5546875, + "step": 3671 + }, + { + "epoch": 0.7286075698199316, + "grad_norm": 33.80745480046786, + "learning_rate": 2.8756846389672706e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 3.87890625, + "logps/chosen": -833.0, + "logps/rejected": -649.0, + "loss": 0.3641, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.907470703125, + "rewards/margins": 7.7265625, + "rewards/rejected": -5.828125, + "step": 3672 + }, + { + "epoch": 0.7288059923607322, + "grad_norm": 35.51111556454564, + "learning_rate": 2.8731534749434464e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.84375, + "logps/chosen": -1406.0, + "logps/rejected": -838.0, + "loss": 0.304, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.94140625, + "rewards/margins": 9.875, + "rewards/rejected": -6.9296875, + "step": 3673 + }, + { + "epoch": 0.7290044149015328, + "grad_norm": 35.51486054467648, + "learning_rate": 2.8706235709712377e-07, + "logits/chosen": 4.40234375, + "logits/rejected": 4.4140625, + "logps/chosen": -1004.0, + "logps/rejected": -769.0, + "loss": 0.4273, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.51171875, + "rewards/margins": 17.640625, + "rewards/rejected": -14.13671875, + "step": 3674 + }, + { + "epoch": 0.7292028374423335, + "grad_norm": 30.477633742705283, + "learning_rate": 2.8680949282641984e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.1953125, + "logps/chosen": -1178.0, + "logps/rejected": -799.5, + "loss": 0.4067, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.802734375, + "rewards/margins": 8.5546875, + "rewards/rejected": -5.75390625, + "step": 3675 + }, + { + "epoch": 0.7294012599831341, + "grad_norm": 32.9176731271737, + "learning_rate": 2.865567548035268e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.81640625, + "logps/chosen": -1224.0, + "logps/rejected": -844.0, + "loss": 0.4475, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.314453125, + 
"rewards/margins": 7.234375, + "rewards/rejected": -4.91796875, + "step": 3676 + }, + { + "epoch": 0.7295996825239347, + "grad_norm": 34.993638599966744, + "learning_rate": 2.86304143149679e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.6953125, + "logps/chosen": -1000.0, + "logps/rejected": -1628.0, + "loss": 0.4956, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.771484375, + "rewards/margins": 8.93359375, + "rewards/rejected": -8.138671875, + "step": 3677 + }, + { + "epoch": 0.7297981050647353, + "grad_norm": 27.17161843491887, + "learning_rate": 2.860516579860496e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.16015625, + "logps/chosen": -1002.0, + "logps/rejected": -674.5, + "loss": 0.3937, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.751953125, + "rewards/margins": 7.734375, + "rewards/rejected": -4.98046875, + "step": 3678 + }, + { + "epoch": 0.729996527605536, + "grad_norm": 36.92593646908051, + "learning_rate": 2.8579929943375115e-07, + "logits/chosen": 4.05078125, + "logits/rejected": 3.88671875, + "logps/chosen": -1054.0, + "logps/rejected": -798.0, + "loss": 0.5422, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.58984375, + "rewards/margins": 6.515625, + "rewards/rejected": -4.92578125, + "step": 3679 + }, + { + "epoch": 0.7301949501463366, + "grad_norm": 29.71547627239916, + "learning_rate": 2.85547067613836e-07, + "logits/chosen": 4.42578125, + "logits/rejected": 4.29296875, + "logps/chosen": -1233.0, + "logps/rejected": -893.0, + "loss": 0.3048, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.67578125, + "rewards/margins": 9.015625, + "rewards/rejected": -6.3515625, + "step": 3680 + }, + { + "epoch": 0.7303933726871372, + "grad_norm": 31.32521757274104, + "learning_rate": 2.8529496264729467e-07, + "logits/chosen": 3.65234375, + "logits/rejected": 3.78125, + "logps/chosen": -893.0, + "logps/rejected": -836.0, + "loss": 0.3728, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.21875, + 
"rewards/margins": 8.2578125, + "rewards/rejected": -6.0546875, + "step": 3681 + }, + { + "epoch": 0.7305917952279379, + "grad_norm": 29.616436293369656, + "learning_rate": 2.8504298465505795e-07, + "logits/chosen": 3.46484375, + "logits/rejected": 3.9453125, + "logps/chosen": -662.0, + "logps/rejected": -1009.0, + "loss": 0.3736, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.31640625, + "rewards/margins": 10.03125, + "rewards/rejected": -6.7265625, + "step": 3682 + }, + { + "epoch": 0.7307902177687385, + "grad_norm": 26.242788699410557, + "learning_rate": 2.8479113375799483e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.796875, + "logps/chosen": -962.0, + "logps/rejected": -591.0, + "loss": 0.4642, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.51171875, + "rewards/margins": 6.953125, + "rewards/rejected": -4.44921875, + "step": 3683 + }, + { + "epoch": 0.7309886403095391, + "grad_norm": 19.36178870829269, + "learning_rate": 2.845394100769138e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 3.99609375, + "logps/chosen": -1121.0, + "logps/rejected": -844.0, + "loss": 0.2711, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.5234375, + "rewards/margins": 9.78125, + "rewards/rejected": -6.265625, + "step": 3684 + }, + { + "epoch": 0.7311870628503399, + "grad_norm": 29.350268308123592, + "learning_rate": 2.842878137325626e-07, + "logits/chosen": 3.890625, + "logits/rejected": 4.171875, + "logps/chosen": -721.0, + "logps/rejected": -1637.0, + "loss": 0.4399, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.98828125, + "rewards/margins": 10.2578125, + "rewards/rejected": -8.265625, + "step": 3685 + }, + { + "epoch": 0.7313854853911405, + "grad_norm": 34.81986559730052, + "learning_rate": 2.84036344845627e-07, + "logits/chosen": 3.9375, + "logits/rejected": 3.34375, + "logps/chosen": -757.5, + "logps/rejected": -669.0, + "loss": 0.3801, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.22265625, + "rewards/margins": 
6.69921875, + "rewards/rejected": -4.484375, + "step": 3686 + }, + { + "epoch": 0.7315839079319411, + "grad_norm": 33.444659861761245, + "learning_rate": 2.837850035367324e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.1875, + "logps/chosen": -983.0, + "logps/rejected": -631.5, + "loss": 0.3991, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.517578125, + "rewards/margins": 7.296875, + "rewards/rejected": -4.7890625, + "step": 3687 + }, + { + "epoch": 0.7317823304727417, + "grad_norm": 20.794143879028915, + "learning_rate": 2.835337899264429e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.3515625, + "logps/chosen": -970.0, + "logps/rejected": -861.5, + "loss": 0.3654, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.404296875, + "rewards/margins": 12.9765625, + "rewards/rejected": -10.5859375, + "step": 3688 + }, + { + "epoch": 0.7319807530135424, + "grad_norm": 33.74726581286156, + "learning_rate": 2.83282704135261e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.77734375, + "logps/chosen": -761.5, + "logps/rejected": -611.0, + "loss": 0.4164, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2333984375, + "rewards/margins": 8.0390625, + "rewards/rejected": -5.796875, + "step": 3689 + }, + { + "epoch": 0.732179175554343, + "grad_norm": 32.628731849966115, + "learning_rate": 2.8303174628362813e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 4.00390625, + "logps/chosen": -1336.0, + "logps/rejected": -850.0, + "loss": 0.2181, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.14453125, + "rewards/margins": 11.21875, + "rewards/rejected": -8.078125, + "step": 3690 + }, + { + "epoch": 0.7323775980951436, + "grad_norm": 37.78642091936126, + "learning_rate": 2.8278091649192437e-07, + "logits/chosen": 3.41015625, + "logits/rejected": 3.26171875, + "logps/chosen": -983.0, + "logps/rejected": -772.0, + "loss": 0.3985, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.921875, + "rewards/margins": 7.9375, + 
"rewards/rejected": -6.0, + "step": 3691 + }, + { + "epoch": 0.7325760206359443, + "grad_norm": 33.42208961512877, + "learning_rate": 2.8253021488046826e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 3.46875, + "logps/chosen": -1135.0, + "logps/rejected": -643.0, + "loss": 0.378, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.259765625, + "rewards/margins": 8.1171875, + "rewards/rejected": -4.85546875, + "step": 3692 + }, + { + "epoch": 0.7327744431767449, + "grad_norm": 18.599692348634232, + "learning_rate": 2.8227964156951677e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 4.15625, + "logps/chosen": -1073.0, + "logps/rejected": -1250.0, + "loss": 0.2629, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.70703125, + "rewards/margins": 11.890625, + "rewards/rejected": -8.1953125, + "step": 3693 + }, + { + "epoch": 0.7329728657175455, + "grad_norm": 28.487246349597875, + "learning_rate": 2.8202919667926583e-07, + "logits/chosen": 3.796875, + "logits/rejected": 3.4609375, + "logps/chosen": -1021.0, + "logps/rejected": -643.0, + "loss": 0.2095, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.41015625, + "rewards/margins": 8.890625, + "rewards/rejected": -5.484375, + "step": 3694 + }, + { + "epoch": 0.7331712882583461, + "grad_norm": 24.413114077761485, + "learning_rate": 2.817788803298492e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.0390625, + "logps/chosen": -967.5, + "logps/rejected": -698.5, + "loss": 0.406, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.740234375, + "rewards/margins": 8.265625, + "rewards/rejected": -5.53125, + "step": 3695 + }, + { + "epoch": 0.7333697107991468, + "grad_norm": 31.88221081690214, + "learning_rate": 2.815286926413391e-07, + "logits/chosen": 3.62109375, + "logits/rejected": 3.9140625, + "logps/chosen": -855.5, + "logps/rejected": -866.0, + "loss": 0.5646, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.314453125, + "rewards/margins": 6.30859375, + "rewards/rejected": 
-3.9833984375, + "step": 3696 + }, + { + "epoch": 0.7335681333399474, + "grad_norm": 33.248230401095924, + "learning_rate": 2.812786337337463e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 3.9921875, + "logps/chosen": -783.0, + "logps/rejected": -582.0, + "loss": 0.468, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.50390625, + "rewards/margins": 6.078125, + "rewards/rejected": -3.5703125, + "step": 3697 + }, + { + "epoch": 0.733766555880748, + "grad_norm": 25.14874645629062, + "learning_rate": 2.8102870372701947e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.1171875, + "logps/chosen": -862.0, + "logps/rejected": -850.5, + "loss": 0.4764, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.76171875, + "rewards/margins": 6.86328125, + "rewards/rejected": -5.09375, + "step": 3698 + }, + { + "epoch": 0.7339649784215487, + "grad_norm": 34.329405773852464, + "learning_rate": 2.8077890274104556e-07, + "logits/chosen": 3.68359375, + "logits/rejected": 3.421875, + "logps/chosen": -987.0, + "logps/rejected": -655.0, + "loss": 0.4604, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.826171875, + "rewards/margins": 7.859375, + "rewards/rejected": -5.03125, + "step": 3699 + }, + { + "epoch": 0.7341634009623493, + "grad_norm": 36.09885165337002, + "learning_rate": 2.8052923089564983e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.95703125, + "logps/chosen": -705.0, + "logps/rejected": -707.0, + "loss": 0.4289, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.919921875, + "rewards/margins": 7.78125, + "rewards/rejected": -5.8515625, + "step": 3700 + }, + { + "epoch": 0.7343618235031499, + "grad_norm": 23.61244725720526, + "learning_rate": 2.8027968831059544e-07, + "logits/chosen": 3.84375, + "logits/rejected": 3.6015625, + "logps/chosen": -1056.5, + "logps/rejected": -672.0, + "loss": 0.3472, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.46484375, + "rewards/margins": 8.515625, + "rewards/rejected": -6.03125, + "step": 
3701 + }, + { + "epoch": 0.7345602460439505, + "grad_norm": 23.74898147020761, + "learning_rate": 2.800302751055834e-07, + "logits/chosen": 3.80859375, + "logits/rejected": 3.67578125, + "logps/chosen": -981.0, + "logps/rejected": -654.0, + "loss": 0.3586, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.015625, + "rewards/margins": 7.7578125, + "rewards/rejected": -4.74609375, + "step": 3702 + }, + { + "epoch": 0.7347586685847513, + "grad_norm": 30.328594597676357, + "learning_rate": 2.797809914002529e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 3.765625, + "logps/chosen": -1056.0, + "logps/rejected": -538.5, + "loss": 0.3654, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.529296875, + "rewards/margins": 7.234375, + "rewards/rejected": -4.71484375, + "step": 3703 + }, + { + "epoch": 0.7349570911255519, + "grad_norm": 29.81145880918147, + "learning_rate": 2.795318373141807e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.36328125, + "logps/chosen": -1109.0, + "logps/rejected": -801.0, + "loss": 0.4546, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.818359375, + "rewards/margins": 6.7109375, + "rewards/rejected": -5.890625, + "step": 3704 + }, + { + "epoch": 0.7351555136663525, + "grad_norm": 30.862496725396078, + "learning_rate": 2.7928281296688195e-07, + "logits/chosen": 3.765625, + "logits/rejected": 3.88671875, + "logps/chosen": -1370.0, + "logps/rejected": -990.0, + "loss": 0.2544, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.31640625, + "rewards/margins": 10.6171875, + "rewards/rejected": -7.3125, + "step": 3705 + }, + { + "epoch": 0.7353539362071532, + "grad_norm": 36.141286874518556, + "learning_rate": 2.7903391847780875e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.0546875, + "logps/chosen": -1184.0, + "logps/rejected": -818.5, + "loss": 0.3444, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.142333984375, + "rewards/margins": 8.3984375, + "rewards/rejected": -6.2421875, + "step": 3706 + }, + { + 
"epoch": 0.7355523587479538, + "grad_norm": 33.2131766027356, + "learning_rate": 2.787851539663518e-07, + "logits/chosen": 4.609375, + "logits/rejected": 4.578125, + "logps/chosen": -1290.0, + "logps/rejected": -961.0, + "loss": 0.3869, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.6875, + "rewards/margins": 9.0, + "rewards/rejected": -5.296875, + "step": 3707 + }, + { + "epoch": 0.7357507812887544, + "grad_norm": 29.398630612567004, + "learning_rate": 2.7853651955183877e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 3.9296875, + "logps/chosen": -1460.0, + "logps/rejected": -856.0, + "loss": 0.4177, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8779296875, + "rewards/margins": 18.734375, + "rewards/rejected": -16.8828125, + "step": 3708 + }, + { + "epoch": 0.7359492038295551, + "grad_norm": 41.71235556425276, + "learning_rate": 2.7828801535353507e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 3.72265625, + "logps/chosen": -1293.0, + "logps/rejected": -791.5, + "loss": 0.2734, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.85546875, + "rewards/margins": 8.65625, + "rewards/rejected": -5.796875, + "step": 3709 + }, + { + "epoch": 0.7361476263703557, + "grad_norm": 35.86508507826924, + "learning_rate": 2.780396414906443e-07, + "logits/chosen": 3.96875, + "logits/rejected": 4.046875, + "logps/chosen": -1002.0, + "logps/rejected": -853.0, + "loss": 0.3311, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.771484375, + "rewards/margins": 9.53125, + "rewards/rejected": -6.75, + "step": 3710 + }, + { + "epoch": 0.7363460489111563, + "grad_norm": 36.704414991840345, + "learning_rate": 2.7779139808230623e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.12109375, + "logps/chosen": -1029.0, + "logps/rejected": -830.0, + "loss": 0.4391, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8671875, + "rewards/margins": 9.140625, + "rewards/rejected": -6.26171875, + "step": 3711 + }, + { + "epoch": 0.7365444714519569, + 
"grad_norm": 26.20140734490092, + "learning_rate": 2.775432852475995e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 3.80078125, + "logps/chosen": -954.0, + "logps/rejected": -707.0, + "loss": 0.299, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1015625, + "rewards/margins": 9.84375, + "rewards/rejected": -6.734375, + "step": 3712 + }, + { + "epoch": 0.7367428939927576, + "grad_norm": 25.736140807236566, + "learning_rate": 2.772953031055388e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.4296875, + "logps/chosen": -1074.0, + "logps/rejected": -1481.0, + "loss": 0.3769, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.65625, + "rewards/margins": 9.5546875, + "rewards/rejected": -5.8828125, + "step": 3713 + }, + { + "epoch": 0.7369413165335582, + "grad_norm": 33.811994858310385, + "learning_rate": 2.7704745177507715e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.08984375, + "logps/chosen": -834.0, + "logps/rejected": -942.0, + "loss": 0.3373, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.58203125, + "rewards/margins": 8.859375, + "rewards/rejected": -6.2734375, + "step": 3714 + }, + { + "epoch": 0.7371397390743588, + "grad_norm": 31.014526965411253, + "learning_rate": 2.767997313751045e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.18359375, + "logps/chosen": -992.0, + "logps/rejected": -799.0, + "loss": 0.469, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.526123046875, + "rewards/margins": 7.44921875, + "rewards/rejected": -4.90234375, + "step": 3715 + }, + { + "epoch": 0.7373381616151595, + "grad_norm": 36.101967530701614, + "learning_rate": 2.7655214202444755e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.65625, + "logps/chosen": -1152.0, + "logps/rejected": -950.0, + "loss": 0.3447, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.390625, + "rewards/margins": 8.78125, + "rewards/rejected": -6.390625, + "step": 3716 + }, + { + "epoch": 0.7375365841559601, + "grad_norm": 
31.558375647675664, + "learning_rate": 2.763046838418711e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.08984375, + "logps/chosen": -1173.0, + "logps/rejected": -1581.0, + "loss": 0.282, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.849609375, + "rewards/margins": 12.390625, + "rewards/rejected": -8.54296875, + "step": 3717 + }, + { + "epoch": 0.7377350066967607, + "grad_norm": 27.973442388210373, + "learning_rate": 2.760573569460757e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.1484375, + "logps/chosen": -1081.0, + "logps/rejected": -644.0, + "loss": 0.3849, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.62109375, + "rewards/margins": 7.765625, + "rewards/rejected": -5.1328125, + "step": 3718 + }, + { + "epoch": 0.7379334292375613, + "grad_norm": 23.66606293936398, + "learning_rate": 2.7581016145570027e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.04296875, + "logps/chosen": -1026.0, + "logps/rejected": -632.0, + "loss": 0.3736, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.15234375, + "rewards/margins": 8.484375, + "rewards/rejected": -5.3515625, + "step": 3719 + }, + { + "epoch": 0.738131851778362, + "grad_norm": 29.608690699722505, + "learning_rate": 2.7556309748931984e-07, + "logits/chosen": 3.75390625, + "logits/rejected": 3.8046875, + "logps/chosen": -1141.0, + "logps/rejected": -683.5, + "loss": 0.3566, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.51812744140625, + "rewards/margins": 8.1484375, + "rewards/rejected": -5.63671875, + "step": 3720 + }, + { + "epoch": 0.7383302743191626, + "grad_norm": 31.33710813900463, + "learning_rate": 2.753161651654466e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.15234375, + "logps/chosen": -995.0, + "logps/rejected": -783.0, + "loss": 0.3305, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5576171875, + "rewards/margins": 9.1171875, + "rewards/rejected": -6.541015625, + "step": 3721 + }, + { + "epoch": 0.7385286968599633, + "grad_norm": 
28.205531544111896, + "learning_rate": 2.750693646025297e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.859375, + "logps/chosen": -912.0, + "logps/rejected": -1451.0, + "loss": 0.4189, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.04296875, + "rewards/margins": 10.359375, + "rewards/rejected": -8.2890625, + "step": 3722 + }, + { + "epoch": 0.738727119400764, + "grad_norm": 36.84813919245299, + "learning_rate": 2.7482269591895494e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.59375, + "logps/chosen": -1868.0, + "logps/rejected": -692.0, + "loss": 0.5388, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.630859375, + "rewards/margins": 11.5703125, + "rewards/rejected": -13.203125, + "step": 3723 + }, + { + "epoch": 0.7389255419415646, + "grad_norm": 32.57196985348673, + "learning_rate": 2.7457615923304476e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.1796875, + "logps/chosen": -1107.0, + "logps/rejected": -983.0, + "loss": 0.5799, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.806640625, + "rewards/margins": 7.76171875, + "rewards/rejected": -5.9453125, + "step": 3724 + }, + { + "epoch": 0.7391239644823652, + "grad_norm": 37.05480613558677, + "learning_rate": 2.743297546630587e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.95703125, + "logps/chosen": -885.0, + "logps/rejected": -730.0, + "loss": 0.4827, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.96484375, + "rewards/margins": 7.3359375, + "rewards/rejected": -5.375, + "step": 3725 + }, + { + "epoch": 0.7393223870231659, + "grad_norm": 37.81278237734385, + "learning_rate": 2.740834823271926e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 3.99609375, + "logps/chosen": -1178.0, + "logps/rejected": -1081.0, + "loss": 0.435, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.052734375, + "rewards/margins": 9.359375, + "rewards/rejected": -6.29296875, + "step": 3726 + }, + { + "epoch": 0.7395208095639665, + "grad_norm": 
39.838334592200226, + "learning_rate": 2.7383734234357876e-07, + "logits/chosen": 3.609375, + "logits/rejected": 3.82421875, + "logps/chosen": -801.0, + "logps/rejected": -922.0, + "loss": 0.5127, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.900390625, + "rewards/margins": 6.0859375, + "rewards/rejected": -4.1875, + "step": 3727 + }, + { + "epoch": 0.7397192321047671, + "grad_norm": 26.806080804145978, + "learning_rate": 2.735913348302862e-07, + "logits/chosen": 3.921875, + "logits/rejected": 4.140625, + "logps/chosen": -820.5, + "logps/rejected": -688.0, + "loss": 0.4198, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.208984375, + "rewards/margins": 6.73046875, + "rewards/rejected": -4.51953125, + "step": 3728 + }, + { + "epoch": 0.7399176546455677, + "grad_norm": 24.63839591711473, + "learning_rate": 2.7334545990532034e-07, + "logits/chosen": 4.078125, + "logits/rejected": 3.76953125, + "logps/chosen": -848.0, + "logps/rejected": -563.0, + "loss": 0.315, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.248046875, + "rewards/margins": 8.265625, + "rewards/rejected": -5.03125, + "step": 3729 + }, + { + "epoch": 0.7401160771863684, + "grad_norm": 34.549428049297475, + "learning_rate": 2.7309971768662334e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.6875, + "logps/chosen": -808.0, + "logps/rejected": -1019.0, + "loss": 0.408, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.546875, + "rewards/margins": 8.234375, + "rewards/rejected": -5.703125, + "step": 3730 + }, + { + "epoch": 0.740314499727169, + "grad_norm": 41.8127224557972, + "learning_rate": 2.7285410829207277e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.11328125, + "logps/chosen": -1100.0, + "logps/rejected": -844.0, + "loss": 0.3954, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.71875, + "rewards/margins": 6.4765625, + "rewards/rejected": -3.75390625, + "step": 3731 + }, + { + "epoch": 0.7405129222679696, + "grad_norm": 42.265911398380716, + 
"learning_rate": 2.726086318394836e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.94921875, + "logps/chosen": -893.0, + "logps/rejected": -1324.5, + "loss": 0.5204, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.091796875, + "rewards/margins": 17.21875, + "rewards/rejected": -16.125, + "step": 3732 + }, + { + "epoch": 0.7407113448087703, + "grad_norm": 44.04552610698464, + "learning_rate": 2.723632884466064e-07, + "logits/chosen": 3.765625, + "logits/rejected": 4.0546875, + "logps/chosen": -693.0, + "logps/rejected": -989.0, + "loss": 0.5115, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.384765625, + "rewards/margins": 7.36328125, + "rewards/rejected": -4.9765625, + "step": 3733 + }, + { + "epoch": 0.7409097673495709, + "grad_norm": 21.248455742863168, + "learning_rate": 2.7211807823112797e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 3.99609375, + "logps/chosen": -979.0, + "logps/rejected": -654.0, + "loss": 0.4432, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.810546875, + "rewards/margins": 6.6533203125, + "rewards/rejected": -3.85009765625, + "step": 3734 + }, + { + "epoch": 0.7411081898903715, + "grad_norm": 35.3445013261497, + "learning_rate": 2.7187300131067135e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.109375, + "logps/chosen": -1209.0, + "logps/rejected": -782.0, + "loss": 0.541, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.4266357421875, + "rewards/margins": 6.26171875, + "rewards/rejected": -3.82421875, + "step": 3735 + }, + { + "epoch": 0.7413066124311721, + "grad_norm": 32.53613925467749, + "learning_rate": 2.7162805780279527e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 3.59765625, + "logps/chosen": -1133.0, + "logps/rejected": -715.5, + "loss": 0.3881, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.65380859375, + "rewards/margins": 8.3828125, + "rewards/rejected": -5.71484375, + "step": 3736 + }, + { + "epoch": 0.7415050349719728, + "grad_norm": 44.34024258932945, 
+ "learning_rate": 2.7138324782499554e-07, + "logits/chosen": 3.5625, + "logits/rejected": 3.26953125, + "logps/chosen": -1002.0, + "logps/rejected": -640.0, + "loss": 0.4188, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.35888671875, + "rewards/margins": 7.5390625, + "rewards/rejected": -6.1796875, + "step": 3737 + }, + { + "epoch": 0.7417034575127734, + "grad_norm": 36.33300103906817, + "learning_rate": 2.711385714947023e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 4.19921875, + "logps/chosen": -895.0, + "logps/rejected": -628.0, + "loss": 0.4394, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.2236328125, + "rewards/margins": 7.84765625, + "rewards/rejected": -5.6171875, + "step": 3738 + }, + { + "epoch": 0.741901880053574, + "grad_norm": 25.28282639066259, + "learning_rate": 2.7089402892928304e-07, + "logits/chosen": 3.87109375, + "logits/rejected": 4.0078125, + "logps/chosen": -1202.0, + "logps/rejected": -668.0, + "loss": 0.4018, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.587890625, + "rewards/margins": 8.125, + "rewards/rejected": -5.546875, + "step": 3739 + }, + { + "epoch": 0.7421003025943748, + "grad_norm": 28.7415804551186, + "learning_rate": 2.706496202460402e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.09375, + "logps/chosen": -957.0, + "logps/rejected": -652.0, + "loss": 0.3539, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6015625, + "rewards/margins": 8.21875, + "rewards/rejected": -5.6171875, + "step": 3740 + }, + { + "epoch": 0.7422987251351754, + "grad_norm": 37.13812428886012, + "learning_rate": 2.704053455622122e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.2890625, + "logps/chosen": -991.0, + "logps/rejected": -1340.0, + "loss": 0.4566, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.90771484375, + "rewards/margins": 8.3046875, + "rewards/rejected": -7.37890625, + "step": 3741 + }, + { + "epoch": 0.742497147675976, + "grad_norm": 30.306330544266356, + "learning_rate": 
2.7016120499497377e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 4.12890625, + "logps/chosen": -727.0, + "logps/rejected": -872.0, + "loss": 0.4731, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.4296875, + "rewards/margins": 6.359375, + "rewards/rejected": -4.9296875, + "step": 3742 + }, + { + "epoch": 0.7426955702167767, + "grad_norm": 28.03400684224706, + "learning_rate": 2.69917198661434e-07, + "logits/chosen": 4.38671875, + "logits/rejected": 3.78125, + "logps/chosen": -1026.0, + "logps/rejected": -611.5, + "loss": 0.297, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.744140625, + "rewards/margins": 8.65625, + "rewards/rejected": -5.90625, + "step": 3743 + }, + { + "epoch": 0.7428939927575773, + "grad_norm": 22.867734306359992, + "learning_rate": 2.696733266786392e-07, + "logits/chosen": 4.4921875, + "logits/rejected": 4.5859375, + "logps/chosen": -1372.5, + "logps/rejected": -854.5, + "loss": 0.346, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5234375, + "rewards/margins": 9.6953125, + "rewards/rejected": -7.171875, + "step": 3744 + }, + { + "epoch": 0.7430924152983779, + "grad_norm": 36.707948419424795, + "learning_rate": 2.6942958916356994e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 3.984375, + "logps/chosen": -902.0, + "logps/rejected": -736.0, + "loss": 0.4371, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.8662109375, + "rewards/margins": 8.828125, + "rewards/rejected": -6.9765625, + "step": 3745 + }, + { + "epoch": 0.7432908378391785, + "grad_norm": 41.137807621019256, + "learning_rate": 2.6918598623314295e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 4.08203125, + "logps/chosen": -922.0, + "logps/rejected": -1550.5, + "loss": 0.2815, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.17578125, + "rewards/margins": 11.625, + "rewards/rejected": -8.453125, + "step": 3746 + }, + { + "epoch": 0.7434892603799792, + "grad_norm": 39.91670090292485, + "learning_rate": 2.689425180042102e-07, + 
"logits/chosen": 3.3046875, + "logits/rejected": 3.45703125, + "logps/chosen": -764.0, + "logps/rejected": -654.5, + "loss": 0.4794, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.67431640625, + "rewards/margins": 6.484375, + "rewards/rejected": -4.8046875, + "step": 3747 + }, + { + "epoch": 0.7436876829207798, + "grad_norm": 29.711252876968167, + "learning_rate": 2.686991845935589e-07, + "logits/chosen": 3.69921875, + "logits/rejected": 3.7421875, + "logps/chosen": -950.0, + "logps/rejected": -596.5, + "loss": 0.5, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.0576171875, + "rewards/margins": 6.9091796875, + "rewards/rejected": -4.84619140625, + "step": 3748 + }, + { + "epoch": 0.7438861054615804, + "grad_norm": 28.53979870793713, + "learning_rate": 2.68455986117912e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.21875, + "logps/chosen": -1011.5, + "logps/rejected": -813.0, + "loss": 0.4556, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8984375, + "rewards/margins": 8.28125, + "rewards/rejected": -5.3828125, + "step": 3749 + }, + { + "epoch": 0.7440845280023811, + "grad_norm": 30.311096075684578, + "learning_rate": 2.682129226939275e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 4.15625, + "logps/chosen": -622.0, + "logps/rejected": -675.5, + "loss": 0.4655, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.5791015625, + "rewards/margins": 5.953125, + "rewards/rejected": -4.376953125, + "step": 3750 + }, + { + "epoch": 0.7442829505431817, + "grad_norm": 29.563966579526408, + "learning_rate": 2.679699944381986e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 3.69140625, + "logps/chosen": -1160.0, + "logps/rejected": -595.5, + "loss": 0.3101, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.595703125, + "rewards/margins": 8.734375, + "rewards/rejected": -6.1640625, + "step": 3751 + }, + { + "epoch": 0.7444813730839823, + "grad_norm": 19.953088928969926, + "learning_rate": 2.677272014672535e-07, + 
"logits/chosen": 3.9609375, + "logits/rejected": 3.97265625, + "logps/chosen": -1064.0, + "logps/rejected": -809.0, + "loss": 0.3581, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.271484375, + "rewards/margins": 8.7734375, + "rewards/rejected": -6.5078125, + "step": 3752 + }, + { + "epoch": 0.7446797956247829, + "grad_norm": 27.16045377435796, + "learning_rate": 2.674845438975557e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.515625, + "logps/chosen": -993.0, + "logps/rejected": -1744.5, + "loss": 0.3615, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.64453125, + "rewards/margins": 11.90625, + "rewards/rejected": -9.2578125, + "step": 3753 + }, + { + "epoch": 0.7448782181655836, + "grad_norm": 31.255453901562856, + "learning_rate": 2.672420218455037e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.234375, + "logps/chosen": -1246.0, + "logps/rejected": -871.5, + "loss": 0.3367, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.40478515625, + "rewards/margins": 9.4609375, + "rewards/rejected": -7.0625, + "step": 3754 + }, + { + "epoch": 0.7450766407063842, + "grad_norm": 34.8145154282518, + "learning_rate": 2.669996354274314e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.3203125, + "logps/chosen": -1050.0, + "logps/rejected": -813.5, + "loss": 0.3384, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.32421875, + "rewards/margins": 8.8828125, + "rewards/rejected": -5.53515625, + "step": 3755 + }, + { + "epoch": 0.7452750632471848, + "grad_norm": 25.068895060881697, + "learning_rate": 2.6675738475960673e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 4.0546875, + "logps/chosen": -1454.0, + "logps/rejected": -1495.5, + "loss": 0.2677, + "rewards/accuracies": 0.96875, + "rewards/chosen": 4.1328125, + "rewards/margins": 10.890625, + "rewards/rejected": -6.765625, + "step": 3756 + }, + { + "epoch": 0.7454734857879856, + "grad_norm": 38.20235233931893, + "learning_rate": 2.665152699582334e-07, + "logits/chosen": 
3.9375, + "logits/rejected": 4.10546875, + "logps/chosen": -991.0, + "logps/rejected": -829.5, + "loss": 0.4271, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3515625, + "rewards/margins": 6.6171875, + "rewards/rejected": -4.265625, + "step": 3757 + }, + { + "epoch": 0.7456719083287862, + "grad_norm": 29.61300885721846, + "learning_rate": 2.6627329113944946e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 3.91015625, + "logps/chosen": -929.0, + "logps/rejected": -709.0, + "loss": 0.3664, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.365234375, + "rewards/margins": 8.609375, + "rewards/rejected": -6.25, + "step": 3758 + }, + { + "epoch": 0.7458703308695868, + "grad_norm": 28.050978033496346, + "learning_rate": 2.6603144841932786e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.91796875, + "logps/chosen": -1259.0, + "logps/rejected": -859.0, + "loss": 0.3808, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.126953125, + "rewards/margins": 14.2421875, + "rewards/rejected": -11.10546875, + "step": 3759 + }, + { + "epoch": 0.7460687534103875, + "grad_norm": 25.611893137111238, + "learning_rate": 2.6578974191387614e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 3.95703125, + "logps/chosen": -1368.0, + "logps/rejected": -803.0, + "loss": 0.4559, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.27294921875, + "rewards/margins": 8.6953125, + "rewards/rejected": -5.41015625, + "step": 3760 + }, + { + "epoch": 0.7462671759511881, + "grad_norm": 32.5884428884543, + "learning_rate": 2.655481717390367e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 3.8203125, + "logps/chosen": -1367.0, + "logps/rejected": -1058.0, + "loss": 0.3091, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.48681640625, + "rewards/margins": 8.359375, + "rewards/rejected": -5.875, + "step": 3761 + }, + { + "epoch": 0.7464655984919887, + "grad_norm": 25.614981520579384, + "learning_rate": 2.653067380106867e-07, + "logits/chosen": 3.67578125, 
+ "logits/rejected": 3.82421875, + "logps/chosen": -1159.0, + "logps/rejected": -799.5, + "loss": 0.32, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.57421875, + "rewards/margins": 9.3515625, + "rewards/rejected": -6.7734375, + "step": 3762 + }, + { + "epoch": 0.7466640210327893, + "grad_norm": 38.00232004950285, + "learning_rate": 2.650654408446371e-07, + "logits/chosen": 3.41015625, + "logits/rejected": 3.7421875, + "logps/chosen": -936.0, + "logps/rejected": -1906.0, + "loss": 0.4204, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.220703125, + "rewards/margins": 12.1015625, + "rewards/rejected": -9.8828125, + "step": 3763 + }, + { + "epoch": 0.74686244357359, + "grad_norm": 34.931592361217334, + "learning_rate": 2.648242803566344e-07, + "logits/chosen": 3.984375, + "logits/rejected": 3.8671875, + "logps/chosen": -1104.0, + "logps/rejected": -780.5, + "loss": 0.3078, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.91650390625, + "rewards/margins": 9.7734375, + "rewards/rejected": -6.8359375, + "step": 3764 + }, + { + "epoch": 0.7470608661143906, + "grad_norm": 30.45040887052469, + "learning_rate": 2.6458325666235865e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.16796875, + "logps/chosen": -1038.0, + "logps/rejected": -582.0, + "loss": 0.2993, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.748046875, + "rewards/margins": 8.9921875, + "rewards/rejected": -6.2421875, + "step": 3765 + }, + { + "epoch": 0.7472592886551912, + "grad_norm": 29.35396286183641, + "learning_rate": 2.643423698774246e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 3.83203125, + "logps/chosen": -814.0, + "logps/rejected": -850.0, + "loss": 0.5086, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.4853515625, + "rewards/margins": 7.2734375, + "rewards/rejected": -5.779296875, + "step": 3766 + }, + { + "epoch": 0.7474577111959919, + "grad_norm": 29.46608471071333, + "learning_rate": 2.641016201173818e-07, + "logits/chosen": 4.15234375, + 
"logits/rejected": 3.96875, + "logps/chosen": -852.0, + "logps/rejected": -658.0, + "loss": 0.4557, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.556640625, + "rewards/margins": 6.73046875, + "rewards/rejected": -5.1875, + "step": 3767 + }, + { + "epoch": 0.7476561337367925, + "grad_norm": 45.93998206002605, + "learning_rate": 2.638610074977131e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 4.02734375, + "logps/chosen": -819.0, + "logps/rejected": -754.5, + "loss": 0.4831, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.953125, + "rewards/margins": 5.921875, + "rewards/rejected": -3.96875, + "step": 3768 + }, + { + "epoch": 0.7478545562775931, + "grad_norm": 22.810704445606024, + "learning_rate": 2.636205321338364e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 4.046875, + "logps/chosen": -746.0, + "logps/rejected": -498.0, + "loss": 0.5909, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.38983154296875, + "rewards/margins": 5.49609375, + "rewards/rejected": -4.109375, + "step": 3769 + }, + { + "epoch": 0.7480529788183937, + "grad_norm": 25.885821105563995, + "learning_rate": 2.6338019414110344e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 3.76171875, + "logps/chosen": -1410.0, + "logps/rejected": -733.0, + "loss": 0.3234, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.806640625, + "rewards/margins": 9.06640625, + "rewards/rejected": -6.26953125, + "step": 3770 + }, + { + "epoch": 0.7482514013591944, + "grad_norm": 35.16054367043191, + "learning_rate": 2.631399936348001e-07, + "logits/chosen": 4.390625, + "logits/rejected": 4.17578125, + "logps/chosen": -941.0, + "logps/rejected": -633.5, + "loss": 0.4766, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.2373046875, + "rewards/margins": 6.62109375, + "rewards/rejected": -4.37890625, + "step": 3771 + }, + { + "epoch": 0.748449823899995, + "grad_norm": 33.6784465539142, + "learning_rate": 2.628999307301462e-07, + "logits/chosen": 4.23828125, + 
"logits/rejected": 4.0859375, + "logps/chosen": -1218.0, + "logps/rejected": -894.0, + "loss": 0.3275, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9130859375, + "rewards/margins": 9.15625, + "rewards/rejected": -6.2421875, + "step": 3772 + }, + { + "epoch": 0.7486482464407956, + "grad_norm": 56.13035939244355, + "learning_rate": 2.626600055422955e-07, + "logits/chosen": 3.62109375, + "logits/rejected": 3.86328125, + "logps/chosen": -849.5, + "logps/rejected": -553.0, + "loss": 0.4386, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.27734375, + "rewards/margins": 7.078125, + "rewards/rejected": -4.78515625, + "step": 3773 + }, + { + "epoch": 0.7488466689815964, + "grad_norm": 27.286498784954237, + "learning_rate": 2.624202181863363e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.87890625, + "logps/chosen": -1228.0, + "logps/rejected": -1253.0, + "loss": 0.4557, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0859375, + "rewards/margins": 7.3984375, + "rewards/rejected": -5.30078125, + "step": 3774 + }, + { + "epoch": 0.749045091522397, + "grad_norm": 29.228930793508873, + "learning_rate": 2.6218056877729e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 3.6875, + "logps/chosen": -1409.0, + "logps/rejected": -731.0, + "loss": 0.3412, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.58203125, + "rewards/margins": 7.7890625, + "rewards/rejected": -5.21875, + "step": 3775 + }, + { + "epoch": 0.7492435140631976, + "grad_norm": 31.07343248561626, + "learning_rate": 2.619410574301122e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 4.35546875, + "logps/chosen": -1068.0, + "logps/rejected": -1683.0, + "loss": 0.5148, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.4228515625, + "rewards/margins": 10.03125, + "rewards/rejected": -8.6640625, + "step": 3776 + }, + { + "epoch": 0.7494419366039982, + "grad_norm": 29.98968756480632, + "learning_rate": 2.617016842596923e-07, + "logits/chosen": 3.69921875, + 
"logits/rejected": 3.87109375, + "logps/chosen": -831.5, + "logps/rejected": -877.0, + "loss": 0.5007, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.3994140625, + "rewards/margins": 6.3828125, + "rewards/rejected": -4.99609375, + "step": 3777 + }, + { + "epoch": 0.7496403591447989, + "grad_norm": 39.075024049848906, + "learning_rate": 2.614624493808534e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.1328125, + "logps/chosen": -660.5, + "logps/rejected": -461.0, + "loss": 0.505, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.990234375, + "rewards/margins": 6.203125, + "rewards/rejected": -4.203125, + "step": 3778 + }, + { + "epoch": 0.7498387816855995, + "grad_norm": 29.06304323993199, + "learning_rate": 2.612233529083522e-07, + "logits/chosen": 3.51953125, + "logits/rejected": 3.734375, + "logps/chosen": -1106.0, + "logps/rejected": -915.0, + "loss": 0.3009, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.70703125, + "rewards/margins": 10.125, + "rewards/rejected": -7.40625, + "step": 3779 + }, + { + "epoch": 0.7500372042264001, + "grad_norm": 27.004743030307274, + "learning_rate": 2.6098439495687867e-07, + "logits/chosen": 3.76953125, + "logits/rejected": 3.9375, + "logps/chosen": -962.0, + "logps/rejected": -657.0, + "loss": 0.3606, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.728515625, + "rewards/margins": 7.828125, + "rewards/rejected": -5.1015625, + "step": 3780 + }, + { + "epoch": 0.7502356267672008, + "grad_norm": 39.07857238065921, + "learning_rate": 2.6074557564105724e-07, + "logits/chosen": 4.0625, + "logits/rejected": 3.984375, + "logps/chosen": -777.0, + "logps/rejected": -656.0, + "loss": 0.3868, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.75, + "rewards/margins": 6.4921875, + "rewards/rejected": -4.734375, + "step": 3781 + }, + { + "epoch": 0.7504340493080014, + "grad_norm": 29.806526362683204, + "learning_rate": 2.605068950754452e-07, + "logits/chosen": 3.8515625, + "logits/rejected": 3.75, + 
"logps/chosen": -1056.0, + "logps/rejected": -718.0, + "loss": 0.4135, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.59375, + "rewards/margins": 7.6640625, + "rewards/rejected": -5.08984375, + "step": 3782 + }, + { + "epoch": 0.750632471848802, + "grad_norm": 99.38203854397587, + "learning_rate": 2.602683533745331e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.125, + "logps/chosen": -1114.0, + "logps/rejected": -725.5, + "loss": 0.4194, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6015625, + "rewards/margins": 7.4765625, + "rewards/rejected": -4.875, + "step": 3783 + }, + { + "epoch": 0.7508308943896027, + "grad_norm": 33.694797371392845, + "learning_rate": 2.6002995065274536e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 4.015625, + "logps/chosen": -1289.0, + "logps/rejected": -711.5, + "loss": 0.4044, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9453125, + "rewards/margins": 9.4375, + "rewards/rejected": -6.5078125, + "step": 3784 + }, + { + "epoch": 0.7510293169304033, + "grad_norm": 27.651670191280882, + "learning_rate": 2.5979168702443945e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.05859375, + "logps/chosen": -953.0, + "logps/rejected": -1243.0, + "loss": 0.3252, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.251953125, + "rewards/margins": 9.6640625, + "rewards/rejected": -7.421875, + "step": 3785 + }, + { + "epoch": 0.7512277394712039, + "grad_norm": 34.51310388743192, + "learning_rate": 2.5955356260390603e-07, + "logits/chosen": 4.578125, + "logits/rejected": 4.28125, + "logps/chosen": -932.0, + "logps/rejected": -796.5, + "loss": 0.5145, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.888671875, + "rewards/margins": 7.140625, + "rewards/rejected": -5.25390625, + "step": 3786 + }, + { + "epoch": 0.7514261620120045, + "grad_norm": 27.98678327493736, + "learning_rate": 2.593155775053697e-07, + "logits/chosen": 4.29296875, + "logits/rejected": 4.31640625, + "logps/chosen": -1285.0, + 
"logps/rejected": -954.0, + "loss": 0.3936, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2939453125, + "rewards/margins": 7.53515625, + "rewards/rejected": -7.234375, + "step": 3787 + }, + { + "epoch": 0.7516245845528052, + "grad_norm": 28.441303876243385, + "learning_rate": 2.590777318429871e-07, + "logits/chosen": 4.015625, + "logits/rejected": 4.19921875, + "logps/chosen": -1084.0, + "logps/rejected": -1170.5, + "loss": 0.3735, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.369140625, + "rewards/margins": 9.4375, + "rewards/rejected": -7.078125, + "step": 3788 + }, + { + "epoch": 0.7518230070936058, + "grad_norm": 32.56905017380288, + "learning_rate": 2.588400257308488e-07, + "logits/chosen": 4.66015625, + "logits/rejected": 4.51171875, + "logps/chosen": -884.0, + "logps/rejected": -713.5, + "loss": 0.525, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.1611328125, + "rewards/margins": 5.49609375, + "rewards/rejected": -3.33984375, + "step": 3789 + }, + { + "epoch": 0.7520214296344064, + "grad_norm": 33.8553494995198, + "learning_rate": 2.586024592829784e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.69921875, + "logps/chosen": -778.5, + "logps/rejected": -614.0, + "loss": 0.4237, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.85546875, + "rewards/margins": 7.453125, + "rewards/rejected": -4.609375, + "step": 3790 + }, + { + "epoch": 0.7522198521752071, + "grad_norm": 34.014035782236405, + "learning_rate": 2.583650326133318e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 3.8671875, + "logps/chosen": -971.0, + "logps/rejected": -682.0, + "loss": 0.4374, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.95703125, + "rewards/margins": 7.03125, + "rewards/rejected": -5.0703125, + "step": 3791 + }, + { + "epoch": 0.7524182747160078, + "grad_norm": 38.97830059026669, + "learning_rate": 2.58127745835799e-07, + "logits/chosen": 3.828125, + "logits/rejected": 4.09765625, + "logps/chosen": -1057.0, + "logps/rejected": 
-1131.0, + "loss": 0.408, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.89453125, + "rewards/margins": 9.5390625, + "rewards/rejected": -6.66796875, + "step": 3792 + }, + { + "epoch": 0.7526166972568084, + "grad_norm": 24.493274533791926, + "learning_rate": 2.5789059906420165e-07, + "logits/chosen": 4.6953125, + "logits/rejected": 4.33984375, + "logps/chosen": -803.5, + "logps/rejected": -574.0, + "loss": 0.2986, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.07421875, + "rewards/margins": 8.5390625, + "rewards/rejected": -5.46484375, + "step": 3793 + }, + { + "epoch": 0.752815119797609, + "grad_norm": 26.16839403938174, + "learning_rate": 2.5765359241229523e-07, + "logits/chosen": 4.703125, + "logits/rejected": 4.921875, + "logps/chosen": -980.0, + "logps/rejected": -1440.0, + "loss": 0.4542, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.7578125, + "rewards/margins": 10.703125, + "rewards/rejected": -6.953125, + "step": 3794 + }, + { + "epoch": 0.7530135423384097, + "grad_norm": 29.27385347585976, + "learning_rate": 2.574167259937675e-07, + "logits/chosen": 4.515625, + "logits/rejected": 4.59765625, + "logps/chosen": -815.0, + "logps/rejected": -800.0, + "loss": 0.4169, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0703125, + "rewards/margins": 7.125, + "rewards/rejected": -4.056640625, + "step": 3795 + }, + { + "epoch": 0.7532119648792103, + "grad_norm": 26.64692120243802, + "learning_rate": 2.571799999222389e-07, + "logits/chosen": 3.52734375, + "logits/rejected": 3.79296875, + "logps/chosen": -815.0, + "logps/rejected": -1167.5, + "loss": 0.3835, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.39453125, + "rewards/margins": 10.2265625, + "rewards/rejected": -7.84765625, + "step": 3796 + }, + { + "epoch": 0.7534103874200109, + "grad_norm": 27.770806021918908, + "learning_rate": 2.569434143112632e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.734375, + "logps/chosen": -565.0, + "logps/rejected": -480.0, + "loss": 
0.6091, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.703125, + "rewards/margins": 3.953125, + "rewards/rejected": -2.25, + "step": 3797 + }, + { + "epoch": 0.7536088099608116, + "grad_norm": 28.0861448869976, + "learning_rate": 2.567069692743258e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 3.89453125, + "logps/chosen": -909.0, + "logps/rejected": -678.0, + "loss": 0.3369, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.28515625, + "rewards/margins": 7.96875, + "rewards/rejected": -4.6796875, + "step": 3798 + }, + { + "epoch": 0.7538072325016122, + "grad_norm": 30.195849977549926, + "learning_rate": 2.5647066492484557e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.2890625, + "logps/chosen": -651.5, + "logps/rejected": -601.5, + "loss": 0.5231, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.865234375, + "rewards/margins": 5.96484375, + "rewards/rejected": -4.091796875, + "step": 3799 + }, + { + "epoch": 0.7540056550424128, + "grad_norm": 29.494721413088982, + "learning_rate": 2.562345013761731e-07, + "logits/chosen": 3.96875, + "logits/rejected": 4.0390625, + "logps/chosen": -896.0, + "logps/rejected": -1670.0, + "loss": 0.4349, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.57421875, + "rewards/margins": 10.890625, + "rewards/rejected": -8.3203125, + "step": 3800 + }, + { + "epoch": 0.7542040775832135, + "grad_norm": 25.48477011487138, + "learning_rate": 2.559984787415923e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.95703125, + "logps/chosen": -791.5, + "logps/rejected": -511.0, + "loss": 0.3406, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.744140625, + "rewards/margins": 7.5234375, + "rewards/rejected": -4.7890625, + "step": 3801 + }, + { + "epoch": 0.7544025001240141, + "grad_norm": 32.939036937056684, + "learning_rate": 2.557625971343187e-07, + "logits/chosen": 4.39453125, + "logits/rejected": 4.0703125, + "logps/chosen": -825.5, + "logps/rejected": -595.75, + "loss": 0.4756, + 
"rewards/accuracies": 0.84375, + "rewards/chosen": 2.19921875, + "rewards/margins": 7.6328125, + "rewards/rejected": -5.4375, + "step": 3802 + }, + { + "epoch": 0.7546009226648147, + "grad_norm": 26.855799684718246, + "learning_rate": 2.555268566675007e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.26171875, + "logps/chosen": -832.0, + "logps/rejected": -780.0, + "loss": 0.4333, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.375, + "rewards/margins": 7.421875, + "rewards/rejected": -5.03515625, + "step": 3803 + }, + { + "epoch": 0.7547993452056153, + "grad_norm": 26.91707688798118, + "learning_rate": 2.552912574542187e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.890625, + "logps/chosen": -1253.0, + "logps/rejected": -735.5, + "loss": 0.3322, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.35546875, + "rewards/margins": 9.578125, + "rewards/rejected": -6.2265625, + "step": 3804 + }, + { + "epoch": 0.754997767746416, + "grad_norm": 32.890314055441905, + "learning_rate": 2.550557996074854e-07, + "logits/chosen": 4.15625, + "logits/rejected": 3.875, + "logps/chosen": -1046.0, + "logps/rejected": -852.5, + "loss": 0.3439, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.07421875, + "rewards/margins": 9.5, + "rewards/rejected": -6.4296875, + "step": 3805 + }, + { + "epoch": 0.7551961902872166, + "grad_norm": 50.211868068643405, + "learning_rate": 2.5482048324024597e-07, + "logits/chosen": 4.5546875, + "logits/rejected": 4.375, + "logps/chosen": -1134.0, + "logps/rejected": -894.5, + "loss": 0.3096, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.08984375, + "rewards/margins": 9.4765625, + "rewards/rejected": -6.3984375, + "step": 3806 + }, + { + "epoch": 0.7553946128280172, + "grad_norm": 32.39630123064825, + "learning_rate": 2.545853084653774e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.78125, + "logps/chosen": -1106.0, + "logps/rejected": -758.5, + "loss": 0.5159, + "rewards/accuracies": 0.8125, + 
"rewards/chosen": 2.49609375, + "rewards/margins": 6.640625, + "rewards/rejected": -4.158203125, + "step": 3807 + }, + { + "epoch": 0.755593035368818, + "grad_norm": 34.57529763912575, + "learning_rate": 2.543502753956889e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 4.0, + "logps/chosen": -1016.0, + "logps/rejected": -1067.0, + "loss": 0.4306, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.81640625, + "rewards/margins": 8.6796875, + "rewards/rejected": -5.859375, + "step": 3808 + }, + { + "epoch": 0.7557914579096185, + "grad_norm": 26.543962641068102, + "learning_rate": 2.541153841439214e-07, + "logits/chosen": 4.23046875, + "logits/rejected": 4.1171875, + "logps/chosen": -1218.0, + "logps/rejected": -1292.0, + "loss": 0.2803, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.16015625, + "rewards/margins": 10.953125, + "rewards/rejected": -7.7890625, + "step": 3809 + }, + { + "epoch": 0.7559898804504191, + "grad_norm": 39.72238986739973, + "learning_rate": 2.5388063482274837e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.8515625, + "logps/chosen": -1082.0, + "logps/rejected": -730.0, + "loss": 0.3527, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.759765625, + "rewards/margins": 8.5625, + "rewards/rejected": -5.796875, + "step": 3810 + }, + { + "epoch": 0.7561883029912198, + "grad_norm": 34.37647825432663, + "learning_rate": 2.5364602754477457e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 3.953125, + "logps/chosen": -882.5, + "logps/rejected": -602.25, + "loss": 0.441, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5078125, + "rewards/margins": 5.82421875, + "rewards/rejected": -3.32421875, + "step": 3811 + }, + { + "epoch": 0.7563867255320205, + "grad_norm": 26.803910121730446, + "learning_rate": 2.5341156242253735e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 3.6015625, + "logps/chosen": -1324.0, + "logps/rejected": -923.0, + "loss": 0.2421, + "rewards/accuracies": 0.90625, + 
"rewards/chosen": 1.51171875, + "rewards/margins": 18.3359375, + "rewards/rejected": -16.7890625, + "step": 3812 + }, + { + "epoch": 0.7565851480728211, + "grad_norm": 39.009452825336005, + "learning_rate": 2.531772395685052e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 3.8828125, + "logps/chosen": -860.0, + "logps/rejected": -677.0, + "loss": 0.4901, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.40234375, + "rewards/margins": 6.6640625, + "rewards/rejected": -4.25390625, + "step": 3813 + }, + { + "epoch": 0.7567835706136217, + "grad_norm": 39.91853984129012, + "learning_rate": 2.5294305909507873e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.71875, + "logps/chosen": -1001.0, + "logps/rejected": -711.0, + "loss": 0.5013, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.296875, + "rewards/margins": 7.515625, + "rewards/rejected": -5.22265625, + "step": 3814 + }, + { + "epoch": 0.7569819931544224, + "grad_norm": 34.67774678690547, + "learning_rate": 2.5270902111459015e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 3.921875, + "logps/chosen": -780.0, + "logps/rejected": -575.5, + "loss": 0.3521, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.87109375, + "rewards/margins": 6.5859375, + "rewards/rejected": -4.703125, + "step": 3815 + }, + { + "epoch": 0.757180415695223, + "grad_norm": 37.07976429118205, + "learning_rate": 2.52475125739303e-07, + "logits/chosen": 3.77734375, + "logits/rejected": 3.796875, + "logps/chosen": -874.0, + "logps/rejected": -1290.0, + "loss": 0.4981, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.056640625, + "rewards/margins": 8.890625, + "rewards/rejected": -6.8203125, + "step": 3816 + }, + { + "epoch": 0.7573788382360236, + "grad_norm": 33.082854273234155, + "learning_rate": 2.5224137308141336e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.3046875, + "logps/chosen": -1183.0, + "logps/rejected": -943.0, + "loss": 0.4876, + "rewards/accuracies": 0.78125, + "rewards/chosen": 
1.953125, + "rewards/margins": 7.234375, + "rewards/rejected": -5.28515625, + "step": 3817 + }, + { + "epoch": 0.7575772607768243, + "grad_norm": 26.386934738907062, + "learning_rate": 2.5200776325304763e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 4.19921875, + "logps/chosen": -1043.0, + "logps/rejected": -680.0, + "loss": 0.3157, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.04296875, + "rewards/margins": 7.9609375, + "rewards/rejected": -4.92578125, + "step": 3818 + }, + { + "epoch": 0.7577756833176249, + "grad_norm": 42.082952300102995, + "learning_rate": 2.517742963662648e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 3.64453125, + "logps/chosen": -1294.0, + "logps/rejected": -841.0, + "loss": 0.4589, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3232421875, + "rewards/margins": 7.4765625, + "rewards/rejected": -5.1484375, + "step": 3819 + }, + { + "epoch": 0.7579741058584255, + "grad_norm": 36.074836628006025, + "learning_rate": 2.5154097253305426e-07, + "logits/chosen": 4.05078125, + "logits/rejected": 4.07421875, + "logps/chosen": -1335.0, + "logps/rejected": -982.0, + "loss": 0.2819, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9296875, + "rewards/margins": 10.40625, + "rewards/rejected": -7.4609375, + "step": 3820 + }, + { + "epoch": 0.7581725283992261, + "grad_norm": 38.01042497608068, + "learning_rate": 2.513077918653377e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 4.1640625, + "logps/chosen": -973.0, + "logps/rejected": -1018.0, + "loss": 0.4255, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1796875, + "rewards/margins": 9.359375, + "rewards/rejected": -7.1953125, + "step": 3821 + }, + { + "epoch": 0.7583709509400268, + "grad_norm": 28.44848737186441, + "learning_rate": 2.510747544749677e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 3.96875, + "logps/chosen": -1355.0, + "logps/rejected": -873.0, + "loss": 0.3965, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.703125, 
+ "rewards/margins": 7.796875, + "rewards/rejected": -4.103515625, + "step": 3822 + }, + { + "epoch": 0.7585693734808274, + "grad_norm": 22.13296095343675, + "learning_rate": 2.5084186047372796e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 3.6640625, + "logps/chosen": -751.5, + "logps/rejected": -580.5, + "loss": 0.2348, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.51171875, + "rewards/margins": 9.3046875, + "rewards/rejected": -5.796875, + "step": 3823 + }, + { + "epoch": 0.758767796021628, + "grad_norm": 34.72307360058488, + "learning_rate": 2.5060910997333397e-07, + "logits/chosen": 3.6328125, + "logits/rejected": 3.8046875, + "logps/chosen": -1073.0, + "logps/rejected": -1492.0, + "loss": 0.4964, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.839599609375, + "rewards/margins": 9.0, + "rewards/rejected": -7.1875, + "step": 3824 + }, + { + "epoch": 0.7589662185624287, + "grad_norm": 32.4704437128556, + "learning_rate": 2.5037650308543144e-07, + "logits/chosen": 4.0, + "logits/rejected": 3.82421875, + "logps/chosen": -950.0, + "logps/rejected": -796.5, + "loss": 0.4195, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.919921875, + "rewards/margins": 8.1796875, + "rewards/rejected": -6.265625, + "step": 3825 + }, + { + "epoch": 0.7591646411032293, + "grad_norm": 36.22209219417954, + "learning_rate": 2.501440399215983e-07, + "logits/chosen": 4.43359375, + "logits/rejected": 4.2421875, + "logps/chosen": -926.0, + "logps/rejected": -1592.0, + "loss": 0.4253, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.388671875, + "rewards/margins": 9.7109375, + "rewards/rejected": -7.3125, + "step": 3826 + }, + { + "epoch": 0.75936306364403, + "grad_norm": 28.78460465835382, + "learning_rate": 2.4991172059334265e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.03515625, + "logps/chosen": -913.0, + "logps/rejected": -660.0, + "loss": 0.3992, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.46484375, + "rewards/margins": 7.8984375, 
+ "rewards/rejected": -5.4296875, + "step": 3827 + }, + { + "epoch": 0.7595614861848305, + "grad_norm": 38.496928491119185, + "learning_rate": 2.496795452121041e-07, + "logits/chosen": 3.8359375, + "logits/rejected": 3.84765625, + "logps/chosen": -954.0, + "logps/rejected": -672.0, + "loss": 0.4751, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.302734375, + "rewards/margins": 6.1484375, + "rewards/rejected": -3.8515625, + "step": 3828 + }, + { + "epoch": 0.7597599087256313, + "grad_norm": 30.998383697316253, + "learning_rate": 2.4944751388925323e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.76953125, + "logps/chosen": -1087.0, + "logps/rejected": -1363.0, + "loss": 0.4183, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6669921875, + "rewards/margins": 8.4375, + "rewards/rejected": -6.7890625, + "step": 3829 + }, + { + "epoch": 0.7599583312664319, + "grad_norm": 30.419541816081434, + "learning_rate": 2.492156267360909e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.1484375, + "logps/chosen": -783.0, + "logps/rejected": -596.0, + "loss": 0.3898, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.580078125, + "rewards/margins": 8.234375, + "rewards/rejected": -6.6640625, + "step": 3830 + }, + { + "epoch": 0.7601567538072325, + "grad_norm": 33.73407316685781, + "learning_rate": 2.489838838638496e-07, + "logits/chosen": 3.4609375, + "logits/rejected": 3.52734375, + "logps/chosen": -810.0, + "logps/rejected": -563.5, + "loss": 0.428, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9560546875, + "rewards/margins": 6.96875, + "rewards/rejected": -5.015625, + "step": 3831 + }, + { + "epoch": 0.7603551763480332, + "grad_norm": 31.1850287989787, + "learning_rate": 2.4875228538369224e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.265625, + "logps/chosen": -1011.0, + "logps/rejected": -716.5, + "loss": 0.4395, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.71484375, + "rewards/margins": 8.2890625, + "rewards/rejected": 
-5.55859375, + "step": 3832 + }, + { + "epoch": 0.7605535988888338, + "grad_norm": 38.45638504935262, + "learning_rate": 2.4852083140671245e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.0, + "logps/chosen": -908.0, + "logps/rejected": -685.5, + "loss": 0.5104, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.966796875, + "rewards/margins": 7.96875, + "rewards/rejected": -6.00390625, + "step": 3833 + }, + { + "epoch": 0.7607520214296344, + "grad_norm": 32.847439568073284, + "learning_rate": 2.482895220439345e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.296875, + "logps/chosen": -1202.0, + "logps/rejected": -1490.0, + "loss": 0.4647, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.81982421875, + "rewards/margins": 9.9609375, + "rewards/rejected": -8.12890625, + "step": 3834 + }, + { + "epoch": 0.760950443970435, + "grad_norm": 29.54247967307125, + "learning_rate": 2.4805835740631353e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 4.078125, + "logps/chosen": -1324.0, + "logps/rejected": -755.0, + "loss": 0.2074, + "rewards/accuracies": 0.90625, + "rewards/chosen": 4.34765625, + "rewards/margins": 11.3125, + "rewards/rejected": -6.953125, + "step": 3835 + }, + { + "epoch": 0.7611488665112357, + "grad_norm": 27.384819234809033, + "learning_rate": 2.478273376047348e-07, + "logits/chosen": 4.0625, + "logits/rejected": 3.9375, + "logps/chosen": -937.0, + "logps/rejected": -732.0, + "loss": 0.5053, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2568359375, + "rewards/margins": 7.5234375, + "rewards/rejected": -6.2578125, + "step": 3836 + }, + { + "epoch": 0.7613472890520363, + "grad_norm": 42.94902453481146, + "learning_rate": 2.475964627500149e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.0, + "logps/chosen": -1054.0, + "logps/rejected": -787.5, + "loss": 0.4344, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.47265625, + "rewards/margins": 7.48046875, + "rewards/rejected": -5.0048828125, + "step": 3837 
+ }, + { + "epoch": 0.7615457115928369, + "grad_norm": 43.334908407728385, + "learning_rate": 2.473657329529001e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.9375, + "logps/chosen": -675.25, + "logps/rejected": -481.0, + "loss": 0.3877, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1826171875, + "rewards/margins": 7.828125, + "rewards/rejected": -5.65234375, + "step": 3838 + }, + { + "epoch": 0.7617441341336376, + "grad_norm": 31.407325935570437, + "learning_rate": 2.471351483240674e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.28125, + "logps/chosen": -982.0, + "logps/rejected": -694.0, + "loss": 0.251, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0234375, + "rewards/margins": 9.203125, + "rewards/rejected": -6.171875, + "step": 3839 + }, + { + "epoch": 0.7619425566744382, + "grad_norm": 31.225437746149826, + "learning_rate": 2.469047089741242e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 3.96484375, + "logps/chosen": -896.0, + "logps/rejected": -861.5, + "loss": 0.3928, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.431640625, + "rewards/margins": 9.015625, + "rewards/rejected": -6.59765625, + "step": 3840 + }, + { + "epoch": 0.7621409792152388, + "grad_norm": 42.78319984001742, + "learning_rate": 2.46674415013608e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.16015625, + "logps/chosen": -1180.0, + "logps/rejected": -920.0, + "loss": 0.3874, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.25, + "rewards/margins": 9.0859375, + "rewards/rejected": -6.84375, + "step": 3841 + }, + { + "epoch": 0.7623394017560395, + "grad_norm": 31.587025689357326, + "learning_rate": 2.464442665529872e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 4.04296875, + "logps/chosen": -850.0, + "logps/rejected": -729.5, + "loss": 0.4584, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4892578125, + "rewards/margins": 6.6015625, + "rewards/rejected": -5.109375, + "step": 3842 + }, + { + "epoch": 
0.7625378242968401, + "grad_norm": 31.485975243280038, + "learning_rate": 2.4621426370265924e-07, + "logits/chosen": 4.59375, + "logits/rejected": 4.1484375, + "logps/chosen": -792.0, + "logps/rejected": -503.0, + "loss": 0.3343, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8984375, + "rewards/margins": 8.359375, + "rewards/rejected": -5.46484375, + "step": 3843 + }, + { + "epoch": 0.7627362468376407, + "grad_norm": 31.032078320654996, + "learning_rate": 2.4598440657295286e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.953125, + "logps/chosen": -1454.0, + "logps/rejected": -867.0, + "loss": 0.4417, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.044921875, + "rewards/margins": 7.1953125, + "rewards/rejected": -4.16015625, + "step": 3844 + }, + { + "epoch": 0.7629346693784413, + "grad_norm": 27.96747932005485, + "learning_rate": 2.4575469527412646e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 4.07421875, + "logps/chosen": -974.0, + "logps/rejected": -786.0, + "loss": 0.1709, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.44140625, + "rewards/margins": 10.78125, + "rewards/rejected": -7.3203125, + "step": 3845 + }, + { + "epoch": 0.7631330919192421, + "grad_norm": 28.40050757744464, + "learning_rate": 2.455251299163682e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 3.82421875, + "logps/chosen": -734.0, + "logps/rejected": -713.0, + "loss": 0.4491, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.63427734375, + "rewards/margins": 6.71875, + "rewards/rejected": -5.0859375, + "step": 3846 + }, + { + "epoch": 0.7633315144600427, + "grad_norm": 31.776690364415742, + "learning_rate": 2.452957106097967e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 3.953125, + "logps/chosen": -967.0, + "logps/rejected": -539.5, + "loss": 0.4933, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.87109375, + "rewards/margins": 6.92578125, + "rewards/rejected": -4.0546875, + "step": 3847 + }, + { + "epoch": 
0.7635299370008433, + "grad_norm": 46.63074864894157, + "learning_rate": 2.450664374644602e-07, + "logits/chosen": 3.58203125, + "logits/rejected": 3.56640625, + "logps/chosen": -820.0, + "logps/rejected": -740.0, + "loss": 0.4963, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.794921875, + "rewards/margins": 6.5390625, + "rewards/rejected": -4.734375, + "step": 3848 + }, + { + "epoch": 0.763728359541644, + "grad_norm": 36.35540314208487, + "learning_rate": 2.448373105903373e-07, + "logits/chosen": 4.078125, + "logits/rejected": 4.078125, + "logps/chosen": -658.0, + "logps/rejected": -721.0, + "loss": 0.3469, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.197021484375, + "rewards/margins": 6.5859375, + "rewards/rejected": -4.3828125, + "step": 3849 + }, + { + "epoch": 0.7639267820824446, + "grad_norm": 33.93615449250671, + "learning_rate": 2.446083300973354e-07, + "logits/chosen": 4.3515625, + "logits/rejected": 4.2109375, + "logps/chosen": -1023.0, + "logps/rejected": -818.0, + "loss": 0.3687, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.025390625, + "rewards/margins": 6.5625, + "rewards/rejected": -4.5390625, + "step": 3850 + }, + { + "epoch": 0.7641252046232452, + "grad_norm": 31.675950196123775, + "learning_rate": 2.443794960952931e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.11328125, + "logps/chosen": -856.5, + "logps/rejected": -735.5, + "loss": 0.4459, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.0830078125, + "rewards/margins": 7.8984375, + "rewards/rejected": -5.8046875, + "step": 3851 + }, + { + "epoch": 0.7643236271640458, + "grad_norm": 31.037988498966854, + "learning_rate": 2.4415080869397763e-07, + "logits/chosen": 3.6953125, + "logits/rejected": 4.25390625, + "logps/chosen": -755.0, + "logps/rejected": -1047.0, + "loss": 0.4855, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.552734375, + "rewards/margins": 6.6796875, + "rewards/rejected": -5.111328125, + "step": 3852 + }, + { + "epoch": 
0.7645220497048465, + "grad_norm": 35.37701536918713, + "learning_rate": 2.4392226800308616e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.8828125, + "logps/chosen": -1019.0, + "logps/rejected": -739.5, + "loss": 0.4036, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4453125, + "rewards/margins": 8.0078125, + "rewards/rejected": -5.5703125, + "step": 3853 + }, + { + "epoch": 0.7647204722456471, + "grad_norm": 29.564289217300207, + "learning_rate": 2.4369387413224615e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 3.94140625, + "logps/chosen": -1094.0, + "logps/rejected": -751.5, + "loss": 0.4136, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.146484375, + "rewards/margins": 8.1875, + "rewards/rejected": -5.048828125, + "step": 3854 + }, + { + "epoch": 0.7649188947864477, + "grad_norm": 37.665494190227214, + "learning_rate": 2.434656271910134e-07, + "logits/chosen": 4.5546875, + "logits/rejected": 4.52734375, + "logps/chosen": -1096.0, + "logps/rejected": -1501.0, + "loss": 0.3121, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.85791015625, + "rewards/margins": 10.953125, + "rewards/rejected": -9.0859375, + "step": 3855 + }, + { + "epoch": 0.7651173173272484, + "grad_norm": 41.63354610206746, + "learning_rate": 2.4323752728887437e-07, + "logits/chosen": 3.7734375, + "logits/rejected": 3.80859375, + "logps/chosen": -746.0, + "logps/rejected": -520.0, + "loss": 0.4305, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.212890625, + "rewards/margins": 6.625, + "rewards/rejected": -4.421875, + "step": 3856 + }, + { + "epoch": 0.765315739868049, + "grad_norm": 42.13238298893958, + "learning_rate": 2.4300957453524445e-07, + "logits/chosen": 3.921875, + "logits/rejected": 3.95703125, + "logps/chosen": -818.0, + "logps/rejected": -1008.0, + "loss": 0.4007, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.138671875, + "rewards/margins": 7.46875, + "rewards/rejected": -5.33984375, + "step": 3857 + }, + { + "epoch": 
0.7655141624088496, + "grad_norm": 32.009885683492726, + "learning_rate": 2.4278176903946847e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.89453125, + "logps/chosen": -1093.0, + "logps/rejected": -617.0, + "loss": 0.2659, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.140625, + "rewards/margins": 9.1171875, + "rewards/rejected": -5.96875, + "step": 3858 + }, + { + "epoch": 0.7657125849496503, + "grad_norm": 25.769077980016014, + "learning_rate": 2.4255411091082055e-07, + "logits/chosen": 4.25, + "logits/rejected": 3.68359375, + "logps/chosen": -1083.0, + "logps/rejected": -587.0, + "loss": 0.361, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.505859375, + "rewards/margins": 9.4140625, + "rewards/rejected": -6.91015625, + "step": 3859 + }, + { + "epoch": 0.7659110074904509, + "grad_norm": 24.876678743690935, + "learning_rate": 2.4232660025850437e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.6796875, + "logps/chosen": -870.0, + "logps/rejected": -1929.5, + "loss": 0.3976, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.20703125, + "rewards/margins": 10.59375, + "rewards/rejected": -8.3828125, + "step": 3860 + }, + { + "epoch": 0.7661094300312515, + "grad_norm": 36.49628482100617, + "learning_rate": 2.4209923719165287e-07, + "logits/chosen": 3.6015625, + "logits/rejected": 3.66015625, + "logps/chosen": -946.0, + "logps/rejected": -901.0, + "loss": 0.3919, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.341796875, + "rewards/margins": 8.0234375, + "rewards/rejected": -5.671875, + "step": 3861 + }, + { + "epoch": 0.7663078525720521, + "grad_norm": 30.494752497297274, + "learning_rate": 2.4187202181932794e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 3.578125, + "logps/chosen": -966.0, + "logps/rejected": -571.5, + "loss": 0.3545, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.51171875, + "rewards/margins": 6.6875, + "rewards/rejected": -4.16796875, + "step": 3862 + }, + { + "epoch": 
0.7665062751128529, + "grad_norm": 36.448146082756374, + "learning_rate": 2.416449542505208e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 3.734375, + "logps/chosen": -798.0, + "logps/rejected": -1725.0, + "loss": 0.4158, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8203125, + "rewards/margins": 9.640625, + "rewards/rejected": -6.830078125, + "step": 3863 + }, + { + "epoch": 0.7667046976536535, + "grad_norm": 28.081094472032806, + "learning_rate": 2.414180345941517e-07, + "logits/chosen": 3.984375, + "logits/rejected": 4.00390625, + "logps/chosen": -609.5, + "logps/rejected": -533.5, + "loss": 0.4118, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.36328125, + "rewards/margins": 7.4296875, + "rewards/rejected": -5.0625, + "step": 3864 + }, + { + "epoch": 0.7669031201944541, + "grad_norm": 24.83967707095064, + "learning_rate": 2.411912629590699e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 3.8203125, + "logps/chosen": -989.0, + "logps/rejected": -880.0, + "loss": 0.2138, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.41015625, + "rewards/margins": 13.703125, + "rewards/rejected": -10.2890625, + "step": 3865 + }, + { + "epoch": 0.7671015427352548, + "grad_norm": 31.146539968477143, + "learning_rate": 2.409646394540539e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.265625, + "logps/chosen": -1211.0, + "logps/rejected": -1825.0, + "loss": 0.2465, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.93359375, + "rewards/margins": 14.28125, + "rewards/rejected": -11.375, + "step": 3866 + }, + { + "epoch": 0.7672999652760554, + "grad_norm": 27.06762020074815, + "learning_rate": 2.407381641878109e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.80859375, + "logps/chosen": -708.5, + "logps/rejected": -541.5, + "loss": 0.4033, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.8642578125, + "rewards/margins": 7.34375, + "rewards/rejected": -5.5, + "step": 3867 + }, + { + "epoch": 0.767498387816856, + 
"grad_norm": 32.858264484917676, + "learning_rate": 2.4051183726897687e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.34765625, + "logps/chosen": -1226.0, + "logps/rejected": -888.0, + "loss": 0.4304, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.970703125, + "rewards/margins": 8.796875, + "rewards/rejected": -5.8359375, + "step": 3868 + }, + { + "epoch": 0.7676968103576566, + "grad_norm": 32.86632146817834, + "learning_rate": 2.402856588061171e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.69921875, + "logps/chosen": -1037.0, + "logps/rejected": -606.0, + "loss": 0.4108, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.830078125, + "rewards/margins": 7.703125, + "rewards/rejected": -5.88671875, + "step": 3869 + }, + { + "epoch": 0.7678952328984573, + "grad_norm": 27.890793513442592, + "learning_rate": 2.4005962890772526e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.16796875, + "logps/chosen": -853.0, + "logps/rejected": -878.5, + "loss": 0.3267, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.05859375, + "rewards/margins": 9.140625, + "rewards/rejected": -6.1015625, + "step": 3870 + }, + { + "epoch": 0.7680936554392579, + "grad_norm": 24.898357237236358, + "learning_rate": 2.398337476822238e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 3.92578125, + "logps/chosen": -755.0, + "logps/rejected": -1171.5, + "loss": 0.4695, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4912109375, + "rewards/margins": 8.7578125, + "rewards/rejected": -7.25390625, + "step": 3871 + }, + { + "epoch": 0.7682920779800585, + "grad_norm": 34.356318995441974, + "learning_rate": 2.396080152379641e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.4375, + "logps/chosen": -1024.0, + "logps/rejected": -612.5, + "loss": 0.4416, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.962890625, + "rewards/margins": 6.9765625, + "rewards/rejected": -5.0078125, + "step": 3872 + }, + { + "epoch": 0.7684905005208592, + 
"grad_norm": 32.69222098064546, + "learning_rate": 2.393824316832258e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.375, + "logps/chosen": -1507.0, + "logps/rejected": -1883.0, + "loss": 0.3664, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8046875, + "rewards/margins": 13.2890625, + "rewards/rejected": -14.0859375, + "step": 3873 + }, + { + "epoch": 0.7686889230616598, + "grad_norm": 30.271665593513834, + "learning_rate": 2.391569971262177e-07, + "logits/chosen": 4.3984375, + "logits/rejected": 4.34375, + "logps/chosen": -913.0, + "logps/rejected": -759.0, + "loss": 0.3986, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.119140625, + "rewards/margins": 8.7265625, + "rewards/rejected": -5.59375, + "step": 3874 + }, + { + "epoch": 0.7688873456024604, + "grad_norm": 27.170174477025714, + "learning_rate": 2.3893171167507615e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 3.53515625, + "logps/chosen": -1408.0, + "logps/rejected": -693.0, + "loss": 0.2622, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5, + "rewards/margins": 9.734375, + "rewards/rejected": -6.234375, + "step": 3875 + }, + { + "epoch": 0.7690857681432611, + "grad_norm": 29.30026171512795, + "learning_rate": 2.3870657543786713e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.05078125, + "logps/chosen": -1378.0, + "logps/rejected": -827.0, + "loss": 0.3362, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.90234375, + "rewards/margins": 9.6640625, + "rewards/rejected": -5.765625, + "step": 3876 + }, + { + "epoch": 0.7692841906840617, + "grad_norm": 22.000862913147643, + "learning_rate": 2.384815885225842e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.5390625, + "logps/chosen": -1175.0, + "logps/rejected": -704.0, + "loss": 0.1819, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.56640625, + "rewards/margins": 10.65625, + "rewards/rejected": -7.09375, + "step": 3877 + }, + { + "epoch": 0.7694826132248623, + "grad_norm": 
31.599173055327295, + "learning_rate": 2.382567510371496e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.9453125, + "logps/chosen": -1123.0, + "logps/rejected": -1349.0, + "loss": 0.5434, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.314453125, + "rewards/margins": 8.6328125, + "rewards/rejected": -6.3125, + "step": 3878 + }, + { + "epoch": 0.7696810357656629, + "grad_norm": 29.259957069603992, + "learning_rate": 2.3803206308941421e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.19140625, + "logps/chosen": -898.0, + "logps/rejected": -569.0, + "loss": 0.3686, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.298828125, + "rewards/margins": 6.671875, + "rewards/rejected": -4.375, + "step": 3879 + }, + { + "epoch": 0.7698794583064636, + "grad_norm": 33.70394168488789, + "learning_rate": 2.3780752478715627e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.84765625, + "logps/chosen": -1148.0, + "logps/rejected": -834.0, + "loss": 0.4283, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.876953125, + "rewards/margins": 7.984375, + "rewards/rejected": -5.11328125, + "step": 3880 + }, + { + "epoch": 0.7700778808472643, + "grad_norm": 30.85648652346791, + "learning_rate": 2.3758313623808331e-07, + "logits/chosen": 3.7421875, + "logits/rejected": 4.17578125, + "logps/chosen": -780.5, + "logps/rejected": -593.5, + "loss": 0.4368, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.134765625, + "rewards/margins": 6.59375, + "rewards/rejected": -4.4609375, + "step": 3881 + }, + { + "epoch": 0.7702763033880649, + "grad_norm": 25.090061133852824, + "learning_rate": 2.3735889754983034e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.5, + "logps/chosen": -961.0, + "logps/rejected": -546.5, + "loss": 0.3677, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.62109375, + "rewards/margins": 8.453125, + "rewards/rejected": -5.8203125, + "step": 3882 + }, + { + "epoch": 0.7704747259288656, + "grad_norm": 32.74436350397554, + 
"learning_rate": 2.3713480882996068e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.00390625, + "logps/chosen": -1163.0, + "logps/rejected": -805.0, + "loss": 0.3769, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.59375, + "rewards/margins": 7.1875, + "rewards/rejected": -4.59765625, + "step": 3883 + }, + { + "epoch": 0.7706731484696662, + "grad_norm": 34.286373161122754, + "learning_rate": 2.3691087018596566e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 3.87109375, + "logps/chosen": -999.0, + "logps/rejected": -805.0, + "loss": 0.4849, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.80078125, + "rewards/margins": 6.4375, + "rewards/rejected": -3.640625, + "step": 3884 + }, + { + "epoch": 0.7708715710104668, + "grad_norm": 38.28897129535513, + "learning_rate": 2.366870817252647e-07, + "logits/chosen": 3.73828125, + "logits/rejected": 3.8046875, + "logps/chosen": -1066.0, + "logps/rejected": -755.0, + "loss": 0.4402, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.58740234375, + "rewards/margins": 7.40625, + "rewards/rejected": -5.8359375, + "step": 3885 + }, + { + "epoch": 0.7710699935512674, + "grad_norm": 35.376927794372975, + "learning_rate": 2.3646344355520536e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.83203125, + "logps/chosen": -922.0, + "logps/rejected": -589.5, + "loss": 0.4054, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.361328125, + "rewards/margins": 7.21875, + "rewards/rejected": -4.8515625, + "step": 3886 + }, + { + "epoch": 0.7712684160920681, + "grad_norm": 47.441163120116556, + "learning_rate": 2.3623995578306245e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.1640625, + "logps/chosen": -1192.0, + "logps/rejected": -674.0, + "loss": 0.4567, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.2841796875, + "rewards/margins": 6.65625, + "rewards/rejected": -4.3564453125, + "step": 3887 + }, + { + "epoch": 0.7714668386328687, + "grad_norm": 22.463130298542147, + "learning_rate": 
2.3601661851603953e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.33203125, + "logps/chosen": -1149.0, + "logps/rejected": -758.0, + "loss": 0.2706, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.609375, + "rewards/margins": 8.5859375, + "rewards/rejected": -4.98828125, + "step": 3888 + }, + { + "epoch": 0.7716652611736693, + "grad_norm": 40.23082265124485, + "learning_rate": 2.3579343186126726e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.09765625, + "logps/chosen": -1206.0, + "logps/rejected": -802.5, + "loss": 0.4168, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.046875, + "rewards/margins": 7.1171875, + "rewards/rejected": -5.078125, + "step": 3889 + }, + { + "epoch": 0.77186368371447, + "grad_norm": 23.33971175296349, + "learning_rate": 2.3557039592580443e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.203125, + "logps/chosen": -914.0, + "logps/rejected": -641.0, + "loss": 0.3531, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.19140625, + "rewards/margins": 8.8203125, + "rewards/rejected": -5.62109375, + "step": 3890 + }, + { + "epoch": 0.7720621062552706, + "grad_norm": 34.217713191696, + "learning_rate": 2.353475108166374e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.2421875, + "logps/chosen": -954.5, + "logps/rejected": -663.5, + "loss": 0.6174, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.9625244140625, + "rewards/margins": 3.923828125, + "rewards/rejected": -2.95703125, + "step": 3891 + }, + { + "epoch": 0.7722605287960712, + "grad_norm": 30.417953063167946, + "learning_rate": 2.3512477664068002e-07, + "logits/chosen": 3.75390625, + "logits/rejected": 3.59765625, + "logps/chosen": -640.0, + "logps/rejected": -433.0, + "loss": 0.4948, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.75634765625, + "rewards/margins": 4.875, + "rewards/rejected": -3.125, + "step": 3892 + }, + { + "epoch": 0.7724589513368719, + "grad_norm": 27.8845586196693, + "learning_rate": 2.349021935047742e-07, 
+ "logits/chosen": 4.04296875, + "logits/rejected": 4.01171875, + "logps/chosen": -1005.0, + "logps/rejected": -629.5, + "loss": 0.3543, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5703125, + "rewards/margins": 7.3515625, + "rewards/rejected": -4.78125, + "step": 3893 + }, + { + "epoch": 0.7726573738776725, + "grad_norm": 30.251932012071727, + "learning_rate": 2.3467976151568892e-07, + "logits/chosen": 4.546875, + "logits/rejected": 4.5546875, + "logps/chosen": -849.5, + "logps/rejected": -780.0, + "loss": 0.3881, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.83984375, + "rewards/margins": 8.2421875, + "rewards/rejected": -5.3984375, + "step": 3894 + }, + { + "epoch": 0.7728557964184731, + "grad_norm": 40.947876464012126, + "learning_rate": 2.3445748078012104e-07, + "logits/chosen": 4.46875, + "logits/rejected": 4.39453125, + "logps/chosen": -987.0, + "logps/rejected": -955.0, + "loss": 0.4204, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.55078125, + "rewards/margins": 7.171875, + "rewards/rejected": -4.625, + "step": 3895 + }, + { + "epoch": 0.7730542189592737, + "grad_norm": 31.729098178034768, + "learning_rate": 2.342353514046945e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 3.7109375, + "logps/chosen": -963.0, + "logps/rejected": -771.0, + "loss": 0.4099, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.994140625, + "rewards/margins": 8.2578125, + "rewards/rejected": -5.265625, + "step": 3896 + }, + { + "epoch": 0.7732526415000744, + "grad_norm": 29.618947225070432, + "learning_rate": 2.3401337349596087e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 3.69921875, + "logps/chosen": -1095.0, + "logps/rejected": -827.0, + "loss": 0.3857, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4921875, + "rewards/margins": 7.046875, + "rewards/rejected": -4.55859375, + "step": 3897 + }, + { + "epoch": 0.773451064040875, + "grad_norm": 45.64913649249548, + "learning_rate": 2.337915471603989e-07, + "logits/chosen": 
3.68359375, + "logits/rejected": 3.84375, + "logps/chosen": -854.0, + "logps/rejected": -618.5, + "loss": 0.5145, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.2685546875, + "rewards/margins": 5.67578125, + "rewards/rejected": -4.40625, + "step": 3898 + }, + { + "epoch": 0.7736494865816756, + "grad_norm": 39.436401596416225, + "learning_rate": 2.335698725044151e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 3.671875, + "logps/chosen": -1032.0, + "logps/rejected": -784.0, + "loss": 0.4355, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.31640625, + "rewards/margins": 7.6640625, + "rewards/rejected": -5.33984375, + "step": 3899 + }, + { + "epoch": 0.7738479091224764, + "grad_norm": 34.47716174210852, + "learning_rate": 2.3334834963434237e-07, + "logits/chosen": 4.38671875, + "logits/rejected": 3.8984375, + "logps/chosen": -1169.0, + "logps/rejected": -693.5, + "loss": 0.4218, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.412109375, + "rewards/margins": 8.8203125, + "rewards/rejected": -6.4140625, + "step": 3900 + }, + { + "epoch": 0.774046331663277, + "grad_norm": 29.94500122721373, + "learning_rate": 2.331269786564417e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.5703125, + "logps/chosen": -1042.0, + "logps/rejected": -709.0, + "loss": 0.2984, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3984375, + "rewards/margins": 8.5703125, + "rewards/rejected": -6.171875, + "step": 3901 + }, + { + "epoch": 0.7742447542040776, + "grad_norm": 29.88973047976337, + "learning_rate": 2.3290575967690067e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.07421875, + "logps/chosen": -1058.0, + "logps/rejected": -727.5, + "loss": 0.5211, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.2919921875, + "rewards/margins": 7.1953125, + "rewards/rejected": -4.921875, + "step": 3902 + }, + { + "epoch": 0.7744431767448782, + "grad_norm": 26.281608600289807, + "learning_rate": 2.3268469280183394e-07, + "logits/chosen": 4.3046875, 
+ "logits/rejected": 4.51953125, + "logps/chosen": -1059.0, + "logps/rejected": -1010.0, + "loss": 0.4198, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.3671875, + "rewards/margins": 9.09375, + "rewards/rejected": -5.734375, + "step": 3903 + }, + { + "epoch": 0.7746415992856789, + "grad_norm": 40.8551981388725, + "learning_rate": 2.324637781372838e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.19140625, + "logps/chosen": -821.0, + "logps/rejected": -1728.5, + "loss": 0.4227, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.099609375, + "rewards/margins": 11.53125, + "rewards/rejected": -9.4453125, + "step": 3904 + }, + { + "epoch": 0.7748400218264795, + "grad_norm": 26.82987665543156, + "learning_rate": 2.3224301578921847e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.0703125, + "logps/chosen": -1173.0, + "logps/rejected": -755.5, + "loss": 0.3408, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.03515625, + "rewards/margins": 8.03125, + "rewards/rejected": -4.984375, + "step": 3905 + }, + { + "epoch": 0.7750384443672801, + "grad_norm": 31.378526342472615, + "learning_rate": 2.3202240586353442e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.21875, + "logps/chosen": -1063.0, + "logps/rejected": -691.5, + "loss": 0.2735, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.00390625, + "rewards/margins": 9.171875, + "rewards/rejected": -6.15234375, + "step": 3906 + }, + { + "epoch": 0.7752368669080808, + "grad_norm": 32.985183016661566, + "learning_rate": 2.3180194846605364e-07, + "logits/chosen": 4.078125, + "logits/rejected": 3.94921875, + "logps/chosen": -1004.0, + "logps/rejected": -663.0, + "loss": 0.6008, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.6025390625, + "rewards/margins": 5.111328125, + "rewards/rejected": -3.509765625, + "step": 3907 + }, + { + "epoch": 0.7754352894488814, + "grad_norm": 20.771547554947073, + "learning_rate": 2.3158164370252594e-07, + "logits/chosen": 4.1015625, + 
"logits/rejected": 3.98046875, + "logps/chosen": -1099.0, + "logps/rejected": -1296.0, + "loss": 0.4513, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.189453125, + "rewards/margins": 8.796875, + "rewards/rejected": -6.625, + "step": 3908 + }, + { + "epoch": 0.775633711989682, + "grad_norm": 46.54050489229726, + "learning_rate": 2.3136149167862788e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.08203125, + "logps/chosen": -938.0, + "logps/rejected": -597.5, + "loss": 0.41, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.20458984375, + "rewards/margins": 7.625, + "rewards/rejected": -5.42578125, + "step": 3909 + }, + { + "epoch": 0.7758321345304826, + "grad_norm": 36.79255132997872, + "learning_rate": 2.3114149249996191e-07, + "logits/chosen": 3.765625, + "logits/rejected": 3.51171875, + "logps/chosen": -768.0, + "logps/rejected": -588.5, + "loss": 0.5114, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.291015625, + "rewards/margins": 6.0625, + "rewards/rejected": -3.7705078125, + "step": 3910 + }, + { + "epoch": 0.7760305570712833, + "grad_norm": 28.151239711260093, + "learning_rate": 2.3092164627205835e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 3.94921875, + "logps/chosen": -857.5, + "logps/rejected": -574.5, + "loss": 0.4514, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.967041015625, + "rewards/margins": 5.88671875, + "rewards/rejected": -3.9306640625, + "step": 3911 + }, + { + "epoch": 0.7762289796120839, + "grad_norm": 27.2179109565197, + "learning_rate": 2.3070195310037286e-07, + "logits/chosen": 4.6015625, + "logits/rejected": 4.07421875, + "logps/chosen": -931.0, + "logps/rejected": -634.0, + "loss": 0.359, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.55859375, + "rewards/margins": 7.828125, + "rewards/rejected": -5.26171875, + "step": 3912 + }, + { + "epoch": 0.7764274021528845, + "grad_norm": 25.946327817199553, + "learning_rate": 2.3048241309028887e-07, + "logits/chosen": 4.046875, + 
"logits/rejected": 3.97265625, + "logps/chosen": -1902.0, + "logps/rejected": -757.5, + "loss": 0.2296, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.5078125, + "rewards/margins": 7.6171875, + "rewards/rejected": -6.109375, + "step": 3913 + }, + { + "epoch": 0.7766258246936852, + "grad_norm": 38.143503958198075, + "learning_rate": 2.302630263471156e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.265625, + "logps/chosen": -828.0, + "logps/rejected": -683.5, + "loss": 0.4168, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.845703125, + "rewards/margins": 7.9375, + "rewards/rejected": -6.1015625, + "step": 3914 + }, + { + "epoch": 0.7768242472344858, + "grad_norm": 35.449508507193265, + "learning_rate": 2.3004379297608893e-07, + "logits/chosen": 4.38671875, + "logits/rejected": 4.6171875, + "logps/chosen": -971.0, + "logps/rejected": -900.0, + "loss": 0.5193, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.625, + "rewards/margins": 12.5, + "rewards/rejected": -9.84765625, + "step": 3915 + }, + { + "epoch": 0.7770226697752864, + "grad_norm": 26.915885969971423, + "learning_rate": 2.2982471308237152e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.640625, + "logps/chosen": -896.0, + "logps/rejected": -737.0, + "loss": 0.3579, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.240234375, + "rewards/margins": 7.5859375, + "rewards/rejected": -5.35546875, + "step": 3916 + }, + { + "epoch": 0.7772210923160872, + "grad_norm": 30.716795058430677, + "learning_rate": 2.2960578677105167e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.015625, + "logps/chosen": -864.0, + "logps/rejected": -809.0, + "loss": 0.5075, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.69140625, + "rewards/margins": 6.8515625, + "rewards/rejected": -5.16015625, + "step": 3917 + }, + { + "epoch": 0.7774195148568878, + "grad_norm": 28.651990721404214, + "learning_rate": 2.2938701414714472e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 
3.6875, + "logps/chosen": -1416.0, + "logps/rejected": -1204.0, + "loss": 0.3106, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.34375, + "rewards/margins": 8.859375, + "rewards/rejected": -5.515625, + "step": 3918 + }, + { + "epoch": 0.7776179373976884, + "grad_norm": 27.56413249447765, + "learning_rate": 2.29168395315592e-07, + "logits/chosen": 4.2421875, + "logits/rejected": 4.13671875, + "logps/chosen": -1076.5, + "logps/rejected": -835.5, + "loss": 0.3746, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5625, + "rewards/margins": 9.4609375, + "rewards/rejected": -6.90625, + "step": 3919 + }, + { + "epoch": 0.777816359938489, + "grad_norm": 27.92122870210507, + "learning_rate": 2.2894993038126097e-07, + "logits/chosen": 4.44140625, + "logits/rejected": 4.3359375, + "logps/chosen": -1178.0, + "logps/rejected": -813.0, + "loss": 0.3125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.34765625, + "rewards/margins": 10.0234375, + "rewards/rejected": -6.6875, + "step": 3920 + }, + { + "epoch": 0.7780147824792897, + "grad_norm": 20.717481966004343, + "learning_rate": 2.287316194489455e-07, + "logits/chosen": 4.4921875, + "logits/rejected": 4.21484375, + "logps/chosen": -938.0, + "logps/rejected": -978.0, + "loss": 0.282, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.634765625, + "rewards/margins": 10.2890625, + "rewards/rejected": -7.6640625, + "step": 3921 + }, + { + "epoch": 0.7782132050200903, + "grad_norm": 23.419501139196033, + "learning_rate": 2.2851346262336536e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.25390625, + "logps/chosen": -1352.0, + "logps/rejected": -1100.0, + "loss": 0.2894, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.326171875, + "rewards/margins": 11.6875, + "rewards/rejected": -8.390625, + "step": 3922 + }, + { + "epoch": 0.7784116275608909, + "grad_norm": 39.35637649041857, + "learning_rate": 2.282954600091664e-07, + "logits/chosen": 3.703125, + "logits/rejected": 3.74609375, + "logps/chosen": 
-975.0, + "logps/rejected": -732.5, + "loss": 0.4477, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.681640625, + "rewards/margins": 7.1328125, + "rewards/rejected": -4.4609375, + "step": 3923 + }, + { + "epoch": 0.7786100501016916, + "grad_norm": 31.780555293063287, + "learning_rate": 2.2807761171092092e-07, + "logits/chosen": 4.5078125, + "logits/rejected": 4.30859375, + "logps/chosen": -874.5, + "logps/rejected": -569.5, + "loss": 0.4625, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.61328125, + "rewards/margins": 6.90625, + "rewards/rejected": -4.27734375, + "step": 3924 + }, + { + "epoch": 0.7788084726424922, + "grad_norm": 27.005066760933396, + "learning_rate": 2.278599178331267e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.2265625, + "logps/chosen": -919.0, + "logps/rejected": -740.0, + "loss": 0.3631, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.943359375, + "rewards/margins": 8.7265625, + "rewards/rejected": -5.77734375, + "step": 3925 + }, + { + "epoch": 0.7790068951832928, + "grad_norm": 31.868759006830427, + "learning_rate": 2.2764237848020764e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.2421875, + "logps/chosen": -1078.0, + "logps/rejected": -709.5, + "loss": 0.3299, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.17529296875, + "rewards/margins": 8.7109375, + "rewards/rejected": -6.5390625, + "step": 3926 + }, + { + "epoch": 0.7792053177240934, + "grad_norm": 24.553368786381224, + "learning_rate": 2.2742499375651346e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 3.5625, + "logps/chosen": -940.0, + "logps/rejected": -721.0, + "loss": 0.3491, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.3203125, + "rewards/margins": 7.4921875, + "rewards/rejected": -6.1796875, + "step": 3927 + }, + { + "epoch": 0.7794037402648941, + "grad_norm": 30.181722115641204, + "learning_rate": 2.2720776376631973e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.8828125, + "logps/chosen": -894.0, + 
"logps/rejected": -773.0, + "loss": 0.3544, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.703125, + "rewards/margins": 7.8515625, + "rewards/rejected": -5.15625, + "step": 3928 + }, + { + "epoch": 0.7796021628056947, + "grad_norm": 30.71602110998278, + "learning_rate": 2.2699068861382805e-07, + "logits/chosen": 4.53515625, + "logits/rejected": 4.44921875, + "logps/chosen": -841.0, + "logps/rejected": -1748.5, + "loss": 0.4623, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.9365234375, + "rewards/margins": 9.5546875, + "rewards/rejected": -7.61328125, + "step": 3929 + }, + { + "epoch": 0.7798005853464953, + "grad_norm": 31.507968337444463, + "learning_rate": 2.2677376840316513e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.1484375, + "logps/chosen": -878.5, + "logps/rejected": -568.0, + "loss": 0.3617, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.876953125, + "rewards/margins": 7.2265625, + "rewards/rejected": -4.36328125, + "step": 3930 + }, + { + "epoch": 0.779999007887296, + "grad_norm": 35.63756214519315, + "learning_rate": 2.2655700323838407e-07, + "logits/chosen": 3.44921875, + "logits/rejected": 3.58984375, + "logps/chosen": -1098.0, + "logps/rejected": -769.0, + "loss": 0.4341, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.60546875, + "rewards/margins": 7.421875, + "rewards/rejected": -4.8125, + "step": 3931 + }, + { + "epoch": 0.7801974304280966, + "grad_norm": 22.359966542563928, + "learning_rate": 2.2634039322346312e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.140625, + "logps/chosen": -738.0, + "logps/rejected": -553.5, + "loss": 0.3089, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.619140625, + "rewards/margins": 7.921875, + "rewards/rejected": -5.296875, + "step": 3932 + }, + { + "epoch": 0.7803958529688972, + "grad_norm": 32.57938455667699, + "learning_rate": 2.2612393846230635e-07, + "logits/chosen": 3.6953125, + "logits/rejected": 3.73046875, + "logps/chosen": -1613.0, + 
"logps/rejected": -718.0, + "loss": 0.4005, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.181640625, + "rewards/margins": 6.052734375, + "rewards/rejected": -4.8671875, + "step": 3933 + }, + { + "epoch": 0.780594275509698, + "grad_norm": 35.430461744770966, + "learning_rate": 2.2590763905874314e-07, + "logits/chosen": 4.28515625, + "logits/rejected": 3.96484375, + "logps/chosen": -1340.0, + "logps/rejected": -1240.5, + "loss": 0.4526, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.94775390625, + "rewards/margins": 10.5078125, + "rewards/rejected": -8.546875, + "step": 3934 + }, + { + "epoch": 0.7807926980504986, + "grad_norm": 26.389543083555544, + "learning_rate": 2.2569149511652842e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.34765625, + "logps/chosen": -941.0, + "logps/rejected": -611.5, + "loss": 0.4025, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.10546875, + "rewards/margins": 6.7734375, + "rewards/rejected": -3.66015625, + "step": 3935 + }, + { + "epoch": 0.7809911205912992, + "grad_norm": 28.748569819596472, + "learning_rate": 2.2547550673934296e-07, + "logits/chosen": 4.35546875, + "logits/rejected": 4.0625, + "logps/chosen": -775.0, + "logps/rejected": -522.5, + "loss": 0.417, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.57373046875, + "rewards/margins": 7.5703125, + "rewards/rejected": -5.98046875, + "step": 3936 + }, + { + "epoch": 0.7811895431320998, + "grad_norm": 24.610065507750583, + "learning_rate": 2.2525967403079206e-07, + "logits/chosen": 4.6953125, + "logits/rejected": 4.9296875, + "logps/chosen": -826.5, + "logps/rejected": -819.0, + "loss": 0.3845, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.94921875, + "rewards/margins": 8.65625, + "rewards/rejected": -5.70703125, + "step": 3937 + }, + { + "epoch": 0.7813879656729005, + "grad_norm": 31.621344195952403, + "learning_rate": 2.250439970944072e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.3125, + "logps/chosen": -1033.5, + 
"logps/rejected": -800.0, + "loss": 0.4262, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.263671875, + "rewards/margins": 6.875, + "rewards/rejected": -4.62890625, + "step": 3938 + }, + { + "epoch": 0.7815863882137011, + "grad_norm": 32.56483338640158, + "learning_rate": 2.2482847603364464e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.671875, + "logps/chosen": -694.5, + "logps/rejected": -560.5, + "loss": 0.441, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.8046875, + "rewards/margins": 6.3984375, + "rewards/rejected": -4.59765625, + "step": 3939 + }, + { + "epoch": 0.7817848107545017, + "grad_norm": 31.32998347681843, + "learning_rate": 2.246131109518859e-07, + "logits/chosen": 3.80859375, + "logits/rejected": 3.70703125, + "logps/chosen": -729.0, + "logps/rejected": -716.0, + "loss": 0.4338, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.013671875, + "rewards/margins": 6.7890625, + "rewards/rejected": -4.765625, + "step": 3940 + }, + { + "epoch": 0.7819832332953024, + "grad_norm": 30.575994406104968, + "learning_rate": 2.2439790195243813e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 4.26953125, + "logps/chosen": -1375.0, + "logps/rejected": -822.0, + "loss": 0.4006, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.392578125, + "rewards/margins": 9.421875, + "rewards/rejected": -6.0078125, + "step": 3941 + }, + { + "epoch": 0.782181655836103, + "grad_norm": 33.937109813220175, + "learning_rate": 2.2418284913853282e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.14453125, + "logps/chosen": -846.5, + "logps/rejected": -647.0, + "loss": 0.4177, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.732421875, + "rewards/margins": 5.9375, + "rewards/rejected": -4.20703125, + "step": 3942 + }, + { + "epoch": 0.7823800783769036, + "grad_norm": 34.8249661792849, + "learning_rate": 2.239679526133273e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.09765625, + "logps/chosen": -807.0, + "logps/rejected": 
-949.0, + "loss": 0.4679, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.806640625, + "rewards/margins": 7.828125, + "rewards/rejected": -6.03515625, + "step": 3943 + }, + { + "epoch": 0.7825785009177042, + "grad_norm": 31.53212867058561, + "learning_rate": 2.2375321247990354e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.26953125, + "logps/chosen": -988.0, + "logps/rejected": -752.0, + "loss": 0.3804, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.333984375, + "rewards/margins": 7.3359375, + "rewards/rejected": -5.0, + "step": 3944 + }, + { + "epoch": 0.7827769234585049, + "grad_norm": 34.33542556156158, + "learning_rate": 2.2353862884126852e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.21484375, + "logps/chosen": -713.0, + "logps/rejected": -499.5, + "loss": 0.4695, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.962890625, + "rewards/margins": 5.98828125, + "rewards/rejected": -4.025390625, + "step": 3945 + }, + { + "epoch": 0.7829753459993055, + "grad_norm": 26.58247572259384, + "learning_rate": 2.2332420180035424e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.84765625, + "logps/chosen": -580.5, + "logps/rejected": -499.0, + "loss": 0.4712, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.015625, + "rewards/margins": 6.6484375, + "rewards/rejected": -4.6484375, + "step": 3946 + }, + { + "epoch": 0.7831737685401061, + "grad_norm": 38.4207454219358, + "learning_rate": 2.2310993146001738e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.51171875, + "logps/chosen": -1076.0, + "logps/rejected": -620.0, + "loss": 0.2988, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.05078125, + "rewards/margins": 8.5625, + "rewards/rejected": -5.53125, + "step": 3947 + }, + { + "epoch": 0.7833721910809068, + "grad_norm": 29.6677448702187, + "learning_rate": 2.2289581792303984e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.75, + "logps/chosen": -1009.0, + "logps/rejected": -808.0, + "loss": 0.3601, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 2.8984375, + "rewards/margins": 9.515625, + "rewards/rejected": -6.625, + "step": 3948 + }, + { + "epoch": 0.7835706136217074, + "grad_norm": 30.885108156149386, + "learning_rate": 2.2268186129212807e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 4.16796875, + "logps/chosen": -749.0, + "logps/rejected": -1850.0, + "loss": 0.466, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.26171875, + "rewards/margins": 10.1328125, + "rewards/rejected": -8.84765625, + "step": 3949 + }, + { + "epoch": 0.783769036162508, + "grad_norm": 30.467890499184918, + "learning_rate": 2.2246806166991316e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.5703125, + "logps/chosen": -837.0, + "logps/rejected": -680.0, + "loss": 0.4388, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.205078125, + "rewards/margins": 6.90625, + "rewards/rejected": -4.69921875, + "step": 3950 + }, + { + "epoch": 0.7839674587033088, + "grad_norm": 26.064181709236927, + "learning_rate": 2.222544191589511e-07, + "logits/chosen": 4.22265625, + "logits/rejected": 3.69921875, + "logps/chosen": -1152.0, + "logps/rejected": -676.0, + "loss": 0.3595, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.619140625, + "rewards/margins": 8.1640625, + "rewards/rejected": -5.5546875, + "step": 3951 + }, + { + "epoch": 0.7841658812441094, + "grad_norm": 24.517006714866373, + "learning_rate": 2.2204093386172224e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.04296875, + "logps/chosen": -846.0, + "logps/rejected": -1048.0, + "loss": 0.3742, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.453125, + "rewards/margins": 8.3046875, + "rewards/rejected": -5.84375, + "step": 3952 + }, + { + "epoch": 0.78436430378491, + "grad_norm": 29.435531087518964, + "learning_rate": 2.2182760588063182e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.44921875, + "logps/chosen": -989.0, + "logps/rejected": -712.0, + "loss": 0.4954, + "rewards/accuracies": 
0.75, + "rewards/chosen": 2.242919921875, + "rewards/margins": 7.16015625, + "rewards/rejected": -4.9140625, + "step": 3953 + }, + { + "epoch": 0.7845627263257106, + "grad_norm": 32.89641651155776, + "learning_rate": 2.2161443531800945e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.1484375, + "logps/chosen": -852.0, + "logps/rejected": -634.5, + "loss": 0.3144, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.083984375, + "rewards/margins": 8.5234375, + "rewards/rejected": -6.4375, + "step": 3954 + }, + { + "epoch": 0.7847611488665113, + "grad_norm": 32.56638646967808, + "learning_rate": 2.2140142227610914e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.421875, + "logps/chosen": -876.5, + "logps/rejected": -694.5, + "loss": 0.3551, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.69140625, + "rewards/margins": 8.4140625, + "rewards/rejected": -6.73046875, + "step": 3955 + }, + { + "epoch": 0.7849595714073119, + "grad_norm": 39.19786820628761, + "learning_rate": 2.2118856685710967e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.30859375, + "logps/chosen": -845.0, + "logps/rejected": -780.5, + "loss": 0.516, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.73046875, + "rewards/margins": 7.0546875, + "rewards/rejected": -4.3203125, + "step": 3956 + }, + { + "epoch": 0.7851579939481125, + "grad_norm": 28.704770120054288, + "learning_rate": 2.2097586916311388e-07, + "logits/chosen": 3.74609375, + "logits/rejected": 4.140625, + "logps/chosen": -1080.0, + "logps/rejected": -1101.0, + "loss": 0.3923, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.193359375, + "rewards/margins": 9.7265625, + "rewards/rejected": -7.53125, + "step": 3957 + }, + { + "epoch": 0.7853564164889132, + "grad_norm": 32.0342568749575, + "learning_rate": 2.2076332929614917e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 3.890625, + "logps/chosen": -855.0, + "logps/rejected": -780.0, + "loss": 0.383, + "rewards/accuracies": 0.875, + 
"rewards/chosen": 2.3046875, + "rewards/margins": 8.1171875, + "rewards/rejected": -5.80859375, + "step": 3958 + }, + { + "epoch": 0.7855548390297138, + "grad_norm": 28.02694986276812, + "learning_rate": 2.2055094735816708e-07, + "logits/chosen": 4.31640625, + "logits/rejected": 4.359375, + "logps/chosen": -1058.0, + "logps/rejected": -755.5, + "loss": 0.4126, + "rewards/accuracies": 0.84375, + "rewards/chosen": 4.08203125, + "rewards/margins": 8.8203125, + "rewards/rejected": -4.732421875, + "step": 3959 + }, + { + "epoch": 0.7857532615705144, + "grad_norm": 31.561493163395234, + "learning_rate": 2.2033872345104332e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.3125, + "logps/chosen": -963.0, + "logps/rejected": -1330.0, + "loss": 0.3599, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.896484375, + "rewards/margins": 11.21875, + "rewards/rejected": -7.3359375, + "step": 3960 + }, + { + "epoch": 0.785951684111315, + "grad_norm": 33.176889051359005, + "learning_rate": 2.2012665767657823e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.25, + "logps/chosen": -707.0, + "logps/rejected": -991.0, + "loss": 0.6006, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.37060546875, + "rewards/margins": 7.6796875, + "rewards/rejected": -5.3125, + "step": 3961 + }, + { + "epoch": 0.7861501066521157, + "grad_norm": 24.38844622392435, + "learning_rate": 2.199147501364957e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 4.234375, + "logps/chosen": -822.5, + "logps/rejected": -790.0, + "loss": 0.4061, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.451171875, + "rewards/margins": 7.890625, + "rewards/rejected": -5.42578125, + "step": 3962 + }, + { + "epoch": 0.7863485291929163, + "grad_norm": 34.675227474877225, + "learning_rate": 2.1970300093244414e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 3.8984375, + "logps/chosen": -1207.0, + "logps/rejected": -869.5, + "loss": 0.3047, + "rewards/accuracies": 0.9375, + "rewards/chosen": 
2.93359375, + "rewards/margins": 10.234375, + "rewards/rejected": -7.2734375, + "step": 3963 + }, + { + "epoch": 0.7865469517337169, + "grad_norm": 37.42969332587391, + "learning_rate": 2.1949141016599595e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.16015625, + "logps/chosen": -933.0, + "logps/rejected": -608.0, + "loss": 0.3628, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.328125, + "rewards/margins": 7.2421875, + "rewards/rejected": -4.921875, + "step": 3964 + }, + { + "epoch": 0.7867453742745176, + "grad_norm": 45.81512579884726, + "learning_rate": 2.1927997793864722e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 3.8828125, + "logps/chosen": -941.5, + "logps/rejected": -736.0, + "loss": 0.3616, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.91796875, + "rewards/margins": 6.671875, + "rewards/rejected": -3.75390625, + "step": 3965 + }, + { + "epoch": 0.7869437968153182, + "grad_norm": 34.013186453089574, + "learning_rate": 2.1906870435181868e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 3.8046875, + "logps/chosen": -960.0, + "logps/rejected": -706.5, + "loss": 0.4287, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.15625, + "rewards/margins": 8.1015625, + "rewards/rejected": -4.93359375, + "step": 3966 + }, + { + "epoch": 0.7871422193561188, + "grad_norm": 29.463082365528344, + "learning_rate": 2.1885758950685396e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.0, + "logps/chosen": -979.5, + "logps/rejected": -602.5, + "loss": 0.3641, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.83203125, + "rewards/margins": 7.46875, + "rewards/rejected": -4.611328125, + "step": 3967 + }, + { + "epoch": 0.7873406418969194, + "grad_norm": 33.19264496732927, + "learning_rate": 2.1864663350502156e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 3.890625, + "logps/chosen": -944.0, + "logps/rejected": -1525.0, + "loss": 0.4249, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.955078125, + 
"rewards/margins": 10.9609375, + "rewards/rejected": -7.98046875, + "step": 3968 + }, + { + "epoch": 0.7875390644377201, + "grad_norm": 34.013751337858416, + "learning_rate": 2.1843583644751307e-07, + "logits/chosen": 4.515625, + "logits/rejected": 4.1875, + "logps/chosen": -967.0, + "logps/rejected": -599.5, + "loss": 0.3178, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3828125, + "rewards/margins": 7.3828125, + "rewards/rejected": -5.00390625, + "step": 3969 + }, + { + "epoch": 0.7877374869785208, + "grad_norm": 30.147536498279692, + "learning_rate": 2.1822519843544422e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.30859375, + "logps/chosen": -873.0, + "logps/rejected": -646.0, + "loss": 0.404, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5595703125, + "rewards/margins": 7.5234375, + "rewards/rejected": -4.9609375, + "step": 3970 + }, + { + "epoch": 0.7879359095193214, + "grad_norm": 31.93807252013677, + "learning_rate": 2.1801471956985422e-07, + "logits/chosen": 4.5234375, + "logits/rejected": 4.421875, + "logps/chosen": -816.0, + "logps/rejected": -754.0, + "loss": 0.4117, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.765625, + "rewards/margins": 8.5, + "rewards/rejected": -5.73828125, + "step": 3971 + }, + { + "epoch": 0.7881343320601221, + "grad_norm": 34.09707039343237, + "learning_rate": 2.1780439995170608e-07, + "logits/chosen": 4.36328125, + "logits/rejected": 4.4375, + "logps/chosen": -1118.0, + "logps/rejected": -840.5, + "loss": 0.3694, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0078125, + "rewards/margins": 8.5234375, + "rewards/rejected": -5.51171875, + "step": 3972 + }, + { + "epoch": 0.7883327546009227, + "grad_norm": 33.771197104894085, + "learning_rate": 2.175942396818866e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 3.984375, + "logps/chosen": -1003.0, + "logps/rejected": -743.0, + "loss": 0.304, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.875, + "rewards/margins": 8.2421875, 
+ "rewards/rejected": -5.3828125, + "step": 3973 + }, + { + "epoch": 0.7885311771417233, + "grad_norm": 42.160512348917706, + "learning_rate": 2.1738423886120545e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.640625, + "logps/chosen": -1326.0, + "logps/rejected": -739.0, + "loss": 0.2488, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.08203125, + "rewards/margins": 9.9140625, + "rewards/rejected": -6.828125, + "step": 3974 + }, + { + "epoch": 0.788729599682524, + "grad_norm": 23.582759892881377, + "learning_rate": 2.1717439759039668e-07, + "logits/chosen": 3.765625, + "logits/rejected": 3.94921875, + "logps/chosen": -1018.0, + "logps/rejected": -1524.0, + "loss": 0.5094, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.708984375, + "rewards/margins": 8.87109375, + "rewards/rejected": -6.13671875, + "step": 3975 + }, + { + "epoch": 0.7889280222233246, + "grad_norm": 33.99403808073027, + "learning_rate": 2.1696471597011735e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 4.0625, + "logps/chosen": -992.0, + "logps/rejected": -855.0, + "loss": 0.4646, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.28125, + "rewards/margins": 7.3984375, + "rewards/rejected": -5.11328125, + "step": 3976 + }, + { + "epoch": 0.7891264447641252, + "grad_norm": 30.754519324600018, + "learning_rate": 2.16755194100948e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.20703125, + "logps/chosen": -991.0, + "logps/rejected": -1066.0, + "loss": 0.4435, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6630859375, + "rewards/margins": 8.375, + "rewards/rejected": -5.70703125, + "step": 3977 + }, + { + "epoch": 0.7893248673049258, + "grad_norm": 27.375704327372773, + "learning_rate": 2.165458320833925e-07, + "logits/chosen": 4.140625, + "logits/rejected": 3.90625, + "logps/chosen": -1074.0, + "logps/rejected": -920.5, + "loss": 0.3812, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.94775390625, + "rewards/margins": 7.72265625, + 
"rewards/rejected": -5.78125, + "step": 3978 + }, + { + "epoch": 0.7895232898457265, + "grad_norm": 36.016766597100606, + "learning_rate": 2.1633663001787798e-07, + "logits/chosen": 3.85546875, + "logits/rejected": 3.72265625, + "logps/chosen": -1191.0, + "logps/rejected": -794.0, + "loss": 0.4467, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.052734375, + "rewards/margins": 7.828125, + "rewards/rejected": -5.7734375, + "step": 3979 + }, + { + "epoch": 0.7897217123865271, + "grad_norm": 32.16561158383144, + "learning_rate": 2.1612758800475522e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 3.96875, + "logps/chosen": -1073.0, + "logps/rejected": -682.0, + "loss": 0.3022, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.95703125, + "rewards/margins": 9.421875, + "rewards/rejected": -6.4609375, + "step": 3980 + }, + { + "epoch": 0.7899201349273277, + "grad_norm": 35.99924267818898, + "learning_rate": 2.1591870614429781e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.34375, + "logps/chosen": -684.0, + "logps/rejected": -950.5, + "loss": 0.5427, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.45556640625, + "rewards/margins": 6.67578125, + "rewards/rejected": -5.22265625, + "step": 3981 + }, + { + "epoch": 0.7901185574681284, + "grad_norm": 32.348956064262396, + "learning_rate": 2.1570998453670264e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 3.92578125, + "logps/chosen": -829.0, + "logps/rejected": -673.0, + "loss": 0.477, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.74609375, + "rewards/margins": 7.65625, + "rewards/rejected": -5.91015625, + "step": 3982 + }, + { + "epoch": 0.790316980008929, + "grad_norm": 30.13516201620524, + "learning_rate": 2.1550142328208991e-07, + "logits/chosen": 3.63671875, + "logits/rejected": 4.0546875, + "logps/chosen": -852.5, + "logps/rejected": -776.0, + "loss": 0.3874, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.53125, + "rewards/margins": 8.421875, + 
"rewards/rejected": -5.88671875, + "step": 3983 + }, + { + "epoch": 0.7905154025497296, + "grad_norm": 39.006917376924584, + "learning_rate": 2.1529302248050257e-07, + "logits/chosen": 4.125, + "logits/rejected": 3.9453125, + "logps/chosen": -740.5, + "logps/rejected": -620.5, + "loss": 0.442, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.923828125, + "rewards/margins": 7.1484375, + "rewards/rejected": -5.23046875, + "step": 3984 + }, + { + "epoch": 0.7907138250905302, + "grad_norm": 33.81568814192288, + "learning_rate": 2.1508478223190674e-07, + "logits/chosen": 3.53515625, + "logits/rejected": 3.828125, + "logps/chosen": -669.0, + "logps/rejected": -512.0, + "loss": 0.5338, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.644287109375, + "rewards/margins": 5.40625, + "rewards/rejected": -3.765625, + "step": 3985 + }, + { + "epoch": 0.790912247631331, + "grad_norm": 34.20127098857533, + "learning_rate": 2.1487670263619194e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.94921875, + "logps/chosen": -926.0, + "logps/rejected": -687.0, + "loss": 0.3622, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.548828125, + "rewards/margins": 8.8984375, + "rewards/rejected": -6.3359375, + "step": 3986 + }, + { + "epoch": 0.7911106701721315, + "grad_norm": 28.7605472967344, + "learning_rate": 2.146687837931697e-07, + "logits/chosen": 3.73046875, + "logits/rejected": 3.546875, + "logps/chosen": -913.0, + "logps/rejected": -582.0, + "loss": 0.3857, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.88671875, + "rewards/margins": 8.5625, + "rewards/rejected": -5.6875, + "step": 3987 + }, + { + "epoch": 0.7913090927129321, + "grad_norm": 22.4302684751119, + "learning_rate": 2.1446102580257546e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.5390625, + "logps/chosen": -1341.0, + "logps/rejected": -1296.0, + "loss": 0.3121, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2109375, + "rewards/margins": 13.140625, + "rewards/rejected": 
-9.953125, + "step": 3988 + }, + { + "epoch": 0.7915075152537329, + "grad_norm": 34.34583120495761, + "learning_rate": 2.1425342876406694e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 4.2109375, + "logps/chosen": -833.0, + "logps/rejected": -1045.0, + "loss": 0.4623, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.48095703125, + "rewards/margins": 8.2890625, + "rewards/rejected": -5.80859375, + "step": 3989 + }, + { + "epoch": 0.7917059377945335, + "grad_norm": 32.76960049498461, + "learning_rate": 2.1404599277722447e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.1796875, + "logps/chosen": -1322.0, + "logps/rejected": -879.0, + "loss": 0.3151, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.04296875, + "rewards/margins": 9.8828125, + "rewards/rejected": -6.8359375, + "step": 3990 + }, + { + "epoch": 0.7919043603353341, + "grad_norm": 31.674891854654852, + "learning_rate": 2.138387179415519e-07, + "logits/chosen": 3.953125, + "logits/rejected": 4.03515625, + "logps/chosen": -990.0, + "logps/rejected": -1058.5, + "loss": 0.4449, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.0390625, + "rewards/margins": 8.4765625, + "rewards/rejected": -6.44921875, + "step": 3991 + }, + { + "epoch": 0.7921027828761348, + "grad_norm": 22.33714736836066, + "learning_rate": 2.1363160435647473e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.24609375, + "logps/chosen": -1265.0, + "logps/rejected": -753.0, + "loss": 0.3577, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.779296875, + "rewards/margins": 8.43359375, + "rewards/rejected": -5.662109375, + "step": 3992 + }, + { + "epoch": 0.7923012054169354, + "grad_norm": 35.298327295674696, + "learning_rate": 2.1342465212134213e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.48828125, + "logps/chosen": -1055.0, + "logps/rejected": -833.0, + "loss": 0.3053, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3828125, + "rewards/margins": 10.375, + "rewards/rejected": -7.984375, + 
"step": 3993 + }, + { + "epoch": 0.792499627957736, + "grad_norm": 38.314405461775245, + "learning_rate": 2.1321786133542518e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 3.6484375, + "logps/chosen": -808.5, + "logps/rejected": -720.5, + "loss": 0.3816, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.15625, + "rewards/margins": 8.8125, + "rewards/rejected": -6.65625, + "step": 3994 + }, + { + "epoch": 0.7926980504985366, + "grad_norm": 28.43750160590186, + "learning_rate": 2.130112320979177e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.07421875, + "logps/chosen": -852.0, + "logps/rejected": -791.5, + "loss": 0.3034, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.384765625, + "rewards/margins": 10.5546875, + "rewards/rejected": -8.171875, + "step": 3995 + }, + { + "epoch": 0.7928964730393373, + "grad_norm": 21.772521382349066, + "learning_rate": 2.1280476450793633e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 3.546875, + "logps/chosen": -1066.0, + "logps/rejected": -744.5, + "loss": 0.3354, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.90625, + "rewards/margins": 7.921875, + "rewards/rejected": -5.03125, + "step": 3996 + }, + { + "epoch": 0.7930948955801379, + "grad_norm": 42.89793085034916, + "learning_rate": 2.1259845866451956e-07, + "logits/chosen": 4.5078125, + "logits/rejected": 4.1171875, + "logps/chosen": -1246.0, + "logps/rejected": -1078.5, + "loss": 0.3608, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6796875, + "rewards/margins": 9.4296875, + "rewards/rejected": -6.76953125, + "step": 3997 + }, + { + "epoch": 0.7932933181209385, + "grad_norm": 30.220006297478328, + "learning_rate": 2.123923146666291e-07, + "logits/chosen": 3.734375, + "logits/rejected": 3.68359375, + "logps/chosen": -925.0, + "logps/rejected": -889.0, + "loss": 0.5744, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.341796875, + "rewards/margins": 7.671875, + "rewards/rejected": -5.328125, + "step": 3998 + }, + { + 
"epoch": 0.7934917406617392, + "grad_norm": 26.548598265655734, + "learning_rate": 2.121863326131479e-07, + "logits/chosen": 3.6875, + "logits/rejected": 4.046875, + "logps/chosen": -1225.0, + "logps/rejected": -1025.0, + "loss": 0.175, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3828125, + "rewards/margins": 11.0625, + "rewards/rejected": -7.6796875, + "step": 3999 + }, + { + "epoch": 0.7936901632025398, + "grad_norm": 25.73002266156707, + "learning_rate": 2.1198051260288248e-07, + "logits/chosen": 4.1875, + "logits/rejected": 3.8671875, + "logps/chosen": -871.0, + "logps/rejected": -591.0, + "loss": 0.3959, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8671875, + "rewards/margins": 8.9609375, + "rewards/rejected": -6.1015625, + "step": 4000 + }, + { + "epoch": 0.7938885857433404, + "grad_norm": 27.95741267508689, + "learning_rate": 2.117748547345608e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 3.99609375, + "logps/chosen": -1680.0, + "logps/rejected": -1406.0, + "loss": 0.247, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.89453125, + "rewards/margins": 12.3125, + "rewards/rejected": -8.4140625, + "step": 4001 + }, + { + "epoch": 0.794087008284141, + "grad_norm": 25.516221020991846, + "learning_rate": 2.1156935910683337e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.0390625, + "logps/chosen": -1092.0, + "logps/rejected": -794.0, + "loss": 0.4076, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.552734375, + "rewards/margins": 8.25, + "rewards/rejected": -5.703125, + "step": 4002 + }, + { + "epoch": 0.7942854308249417, + "grad_norm": 38.32157995919037, + "learning_rate": 2.1136402581827273e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.76171875, + "logps/chosen": -1058.0, + "logps/rejected": -1053.5, + "loss": 0.44, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.00390625, + "rewards/margins": 8.2734375, + "rewards/rejected": -6.28515625, + "step": 4003 + }, + { + "epoch": 0.7944838533657423, + 
"grad_norm": 30.897805776527495, + "learning_rate": 2.1115885496737345e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.046875, + "logps/chosen": -1203.0, + "logps/rejected": -695.0, + "loss": 0.4772, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.80859375, + "rewards/margins": 6.14453125, + "rewards/rejected": -4.3203125, + "step": 4004 + }, + { + "epoch": 0.794682275906543, + "grad_norm": 24.49306029357323, + "learning_rate": 2.1095384665255267e-07, + "logits/chosen": 3.984375, + "logits/rejected": 3.66796875, + "logps/chosen": -567.0, + "logps/rejected": -551.0, + "loss": 0.4943, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.025390625, + "rewards/margins": 6.2734375, + "rewards/rejected": -4.25390625, + "step": 4005 + }, + { + "epoch": 0.7948806984473437, + "grad_norm": 41.7223013883424, + "learning_rate": 2.107490009721491e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.25, + "logps/chosen": -1105.5, + "logps/rejected": -937.0, + "loss": 0.2913, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.380859375, + "rewards/margins": 13.84765625, + "rewards/rejected": -11.470703125, + "step": 4006 + }, + { + "epoch": 0.7950791209881443, + "grad_norm": 31.91559775215073, + "learning_rate": 2.1054431802442345e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.1640625, + "logps/chosen": -1016.5, + "logps/rejected": -828.0, + "loss": 0.3091, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.224609375, + "rewards/margins": 7.8203125, + "rewards/rejected": -5.6015625, + "step": 4007 + }, + { + "epoch": 0.7952775435289449, + "grad_norm": 37.61466181236503, + "learning_rate": 2.103397979075587e-07, + "logits/chosen": 3.78515625, + "logits/rejected": 3.45703125, + "logps/chosen": -984.0, + "logps/rejected": -533.0, + "loss": 0.3974, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.220703125, + "rewards/margins": 6.1015625, + "rewards/rejected": -3.884765625, + "step": 4008 + }, + { + "epoch": 0.7954759660697456, + 
"grad_norm": 25.775432005807353, + "learning_rate": 2.1013544071965942e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.97265625, + "logps/chosen": -996.0, + "logps/rejected": -648.0, + "loss": 0.5071, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.24609375, + "rewards/margins": 6.375, + "rewards/rejected": -4.140625, + "step": 4009 + }, + { + "epoch": 0.7956743886105462, + "grad_norm": 21.189084833017315, + "learning_rate": 2.0993124655875195e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.19921875, + "logps/chosen": -961.0, + "logps/rejected": -634.5, + "loss": 0.5155, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.50830078125, + "rewards/margins": 6.927734375, + "rewards/rejected": -4.427734375, + "step": 4010 + }, + { + "epoch": 0.7958728111513468, + "grad_norm": 32.14580262522851, + "learning_rate": 2.097272155227849e-07, + "logits/chosen": 3.7421875, + "logits/rejected": 4.0859375, + "logps/chosen": -956.0, + "logps/rejected": -957.0, + "loss": 0.5126, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3046875, + "rewards/margins": 7.708984375, + "rewards/rejected": -5.39453125, + "step": 4011 + }, + { + "epoch": 0.7960712336921474, + "grad_norm": 29.087667026369388, + "learning_rate": 2.0952334770962822e-07, + "logits/chosen": 3.75, + "logits/rejected": 3.60546875, + "logps/chosen": -781.5, + "logps/rejected": -633.0, + "loss": 0.5125, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.2578125, + "rewards/margins": 5.5546875, + "rewards/rejected": -3.3056640625, + "step": 4012 + }, + { + "epoch": 0.7962696562329481, + "grad_norm": 34.50338449743132, + "learning_rate": 2.0931964321707363e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.28125, + "logps/chosen": -994.0, + "logps/rejected": -1416.0, + "loss": 0.3487, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8359375, + "rewards/margins": 10.375, + "rewards/rejected": -7.55078125, + "step": 4013 + }, + { + "epoch": 0.7964680787737487, + "grad_norm": 
31.691241237890726, + "learning_rate": 2.0911610214283464e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.8203125, + "logps/chosen": -925.0, + "logps/rejected": -902.0, + "loss": 0.4277, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.55859375, + "rewards/margins": 8.46875, + "rewards/rejected": -5.92578125, + "step": 4014 + }, + { + "epoch": 0.7966665013145493, + "grad_norm": 26.363843128750357, + "learning_rate": 2.0891272458454608e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.1171875, + "logps/chosen": -812.0, + "logps/rejected": -699.5, + "loss": 0.4517, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.79296875, + "rewards/margins": 9.0234375, + "rewards/rejected": -7.23828125, + "step": 4015 + }, + { + "epoch": 0.79686492385535, + "grad_norm": 34.97453084479181, + "learning_rate": 2.08709510639765e-07, + "logits/chosen": 4.015625, + "logits/rejected": 4.08203125, + "logps/chosen": -1343.0, + "logps/rejected": -933.0, + "loss": 0.4316, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.107421875, + "rewards/margins": 7.3359375, + "rewards/rejected": -5.220703125, + "step": 4016 + }, + { + "epoch": 0.7970633463961506, + "grad_norm": 30.6217800600957, + "learning_rate": 2.0850646040596897e-07, + "logits/chosen": 3.9375, + "logits/rejected": 3.73046875, + "logps/chosen": -937.5, + "logps/rejected": -600.0, + "loss": 0.3921, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.49609375, + "rewards/margins": 6.03125, + "rewards/rejected": -3.5390625, + "step": 4017 + }, + { + "epoch": 0.7972617689369512, + "grad_norm": 28.616093656373728, + "learning_rate": 2.0830357398055805e-07, + "logits/chosen": 3.953125, + "logits/rejected": 3.734375, + "logps/chosen": -1128.0, + "logps/rejected": -737.0, + "loss": 0.3166, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.513671875, + "rewards/margins": 8.1796875, + "rewards/rejected": -5.6484375, + "step": 4018 + }, + { + "epoch": 0.7974601914777518, + "grad_norm": 25.66820850049508, + 
"learning_rate": 2.0810085146085282e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 3.92578125, + "logps/chosen": -1322.0, + "logps/rejected": -809.0, + "loss": 0.3378, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.611328125, + "rewards/margins": 9.890625, + "rewards/rejected": -7.28125, + "step": 4019 + }, + { + "epoch": 0.7976586140185525, + "grad_norm": 25.2995803503171, + "learning_rate": 2.07898292944096e-07, + "logits/chosen": 4.5, + "logits/rejected": 3.99609375, + "logps/chosen": -716.0, + "logps/rejected": -555.5, + "loss": 0.4722, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.353515625, + "rewards/margins": 6.9765625, + "rewards/rejected": -4.634765625, + "step": 4020 + }, + { + "epoch": 0.7978570365593531, + "grad_norm": 23.733871258759212, + "learning_rate": 2.0769589852745124e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 3.953125, + "logps/chosen": -1545.0, + "logps/rejected": -771.5, + "loss": 0.3873, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.3515625, + "rewards/margins": 6.203125, + "rewards/rejected": -6.54296875, + "step": 4021 + }, + { + "epoch": 0.7980554591001537, + "grad_norm": 21.90452762151463, + "learning_rate": 2.0749366830800337e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 3.890625, + "logps/chosen": -874.0, + "logps/rejected": -745.0, + "loss": 0.3998, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8046875, + "rewards/margins": 8.5, + "rewards/rejected": -5.68359375, + "step": 4022 + }, + { + "epoch": 0.7982538816409545, + "grad_norm": 35.75719696068732, + "learning_rate": 2.07291602382759e-07, + "logits/chosen": 3.578125, + "logits/rejected": 3.4375, + "logps/chosen": -778.0, + "logps/rejected": -686.0, + "loss": 0.4108, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.814453125, + "rewards/margins": 7.38671875, + "rewards/rejected": -5.56640625, + "step": 4023 + }, + { + "epoch": 0.7984523041817551, + "grad_norm": 22.103255642451526, + "learning_rate": 
2.0708970084864512e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.3203125, + "logps/chosen": -1085.0, + "logps/rejected": -1516.0, + "loss": 0.2125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3828125, + "rewards/margins": 13.609375, + "rewards/rejected": -10.234375, + "step": 4024 + }, + { + "epoch": 0.7986507267225557, + "grad_norm": 37.50464241816457, + "learning_rate": 2.068879638025107e-07, + "logits/chosen": 3.71875, + "logits/rejected": 3.79296875, + "logps/chosen": -789.0, + "logps/rejected": -699.0, + "loss": 0.3316, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.46484375, + "rewards/margins": 7.3828125, + "rewards/rejected": -4.921875, + "step": 4025 + }, + { + "epoch": 0.7988491492633563, + "grad_norm": 36.665932991330465, + "learning_rate": 2.0668639134112525e-07, + "logits/chosen": 3.6328125, + "logits/rejected": 3.6171875, + "logps/chosen": -674.0, + "logps/rejected": -456.5, + "loss": 0.4521, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.1583251953125, + "rewards/margins": 6.8359375, + "rewards/rejected": -5.6796875, + "step": 4026 + }, + { + "epoch": 0.799047571804157, + "grad_norm": 46.26829875585624, + "learning_rate": 2.0648498356117932e-07, + "logits/chosen": 3.6875, + "logits/rejected": 3.71484375, + "logps/chosen": -1060.0, + "logps/rejected": -697.0, + "loss": 0.3914, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.33203125, + "rewards/margins": 6.7109375, + "rewards/rejected": -4.375, + "step": 4027 + }, + { + "epoch": 0.7992459943449576, + "grad_norm": 29.67412687340163, + "learning_rate": 2.0628374055928511e-07, + "logits/chosen": 3.765625, + "logits/rejected": 3.71875, + "logps/chosen": -953.0, + "logps/rejected": -1606.0, + "loss": 0.3263, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.37890625, + "rewards/margins": 11.7734375, + "rewards/rejected": -9.4140625, + "step": 4028 + }, + { + "epoch": 0.7994444168857582, + "grad_norm": 35.30106500503529, + "learning_rate": 
2.0608266243197476e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.02734375, + "logps/chosen": -730.0, + "logps/rejected": -516.5, + "loss": 0.4425, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.416015625, + "rewards/margins": 6.7890625, + "rewards/rejected": -4.36328125, + "step": 4029 + }, + { + "epoch": 0.7996428394265589, + "grad_norm": 36.413735955075936, + "learning_rate": 2.0588174927570224e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 3.85546875, + "logps/chosen": -992.0, + "logps/rejected": -936.0, + "loss": 0.4012, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.431640625, + "rewards/margins": 9.2734375, + "rewards/rejected": -6.8359375, + "step": 4030 + }, + { + "epoch": 0.7998412619673595, + "grad_norm": 23.83376875981711, + "learning_rate": 2.0568100118684195e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.44140625, + "logps/chosen": -1211.0, + "logps/rejected": -834.0, + "loss": 0.2365, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.35546875, + "rewards/margins": 11.171875, + "rewards/rejected": -6.84375, + "step": 4031 + }, + { + "epoch": 0.8000396845081601, + "grad_norm": 41.18496064745066, + "learning_rate": 2.0548041826168915e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.93359375, + "logps/chosen": -981.0, + "logps/rejected": -664.5, + "loss": 0.3793, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.458984375, + "rewards/margins": 8.3671875, + "rewards/rejected": -5.90234375, + "step": 4032 + }, + { + "epoch": 0.8002381070489608, + "grad_norm": 28.49138655443912, + "learning_rate": 2.0528000059645995e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 3.80859375, + "logps/chosen": -1064.0, + "logps/rejected": -817.5, + "loss": 0.3511, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8359375, + "rewards/margins": 9.0390625, + "rewards/rejected": -6.20703125, + "step": 4033 + }, + { + "epoch": 0.8004365295897614, + "grad_norm": 24.36814451950417, + "learning_rate": 
2.0507974828729107e-07, + "logits/chosen": 4.35546875, + "logits/rejected": 3.91796875, + "logps/chosen": -2023.0, + "logps/rejected": -1544.0, + "loss": 0.3736, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.52734375, + "rewards/margins": 8.0078125, + "rewards/rejected": -9.5390625, + "step": 4034 + }, + { + "epoch": 0.800634952130562, + "grad_norm": 30.034712320502816, + "learning_rate": 2.048796614302399e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.3203125, + "logps/chosen": -1270.0, + "logps/rejected": -1005.0, + "loss": 0.2945, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.046875, + "rewards/margins": 11.015625, + "rewards/rejected": -7.9609375, + "step": 4035 + }, + { + "epoch": 0.8008333746713626, + "grad_norm": 25.79629730055835, + "learning_rate": 2.0467974012128483e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 3.71484375, + "logps/chosen": -1132.0, + "logps/rejected": -707.0, + "loss": 0.2653, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.59375, + "rewards/margins": 9.9453125, + "rewards/rejected": -6.33203125, + "step": 4036 + }, + { + "epoch": 0.8010317972121633, + "grad_norm": 22.534119406850287, + "learning_rate": 2.0447998445632437e-07, + "logits/chosen": 3.859375, + "logits/rejected": 4.0703125, + "logps/chosen": -810.0, + "logps/rejected": -614.0, + "loss": 0.4427, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.65234375, + "rewards/margins": 6.9453125, + "rewards/rejected": -4.3046875, + "step": 4037 + }, + { + "epoch": 0.8012302197529639, + "grad_norm": 32.77735328044274, + "learning_rate": 2.0428039453117786e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 4.015625, + "logps/chosen": -1053.0, + "logps/rejected": -768.0, + "loss": 0.2983, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.859375, + "rewards/margins": 8.953125, + "rewards/rejected": -6.08984375, + "step": 4038 + }, + { + "epoch": 0.8014286422937645, + "grad_norm": 27.905864237761563, + "learning_rate": 
2.0408097044158497e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 3.81640625, + "logps/chosen": -1164.0, + "logps/rejected": -900.0, + "loss": 0.3277, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4453125, + "rewards/margins": 9.265625, + "rewards/rejected": -6.8203125, + "step": 4039 + }, + { + "epoch": 0.8016270648345653, + "grad_norm": 34.81294645992447, + "learning_rate": 2.038817122832058e-07, + "logits/chosen": 3.7421875, + "logits/rejected": 3.3359375, + "logps/chosen": -1045.5, + "logps/rejected": -528.5, + "loss": 0.4543, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.66259765625, + "rewards/margins": 7.03125, + "rewards/rejected": -5.35546875, + "step": 4040 + }, + { + "epoch": 0.8018254873753659, + "grad_norm": 25.691192267770756, + "learning_rate": 2.0368262015162113e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 3.8359375, + "logps/chosen": -923.0, + "logps/rejected": -1093.0, + "loss": 0.4267, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.453125, + "rewards/margins": 8.9921875, + "rewards/rejected": -6.5390625, + "step": 4041 + }, + { + "epoch": 0.8020239099161665, + "grad_norm": 32.547674031204764, + "learning_rate": 2.0348369414233174e-07, + "logits/chosen": 3.90625, + "logits/rejected": 4.28125, + "logps/chosen": -888.0, + "logps/rejected": -590.5, + "loss": 0.5623, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.859375, + "rewards/margins": 6.45703125, + "rewards/rejected": -4.58984375, + "step": 4042 + }, + { + "epoch": 0.8022223324569671, + "grad_norm": 22.61505361786921, + "learning_rate": 2.032849343507591e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.9140625, + "logps/chosen": -1049.0, + "logps/rejected": -1008.0, + "loss": 0.2149, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.90234375, + "rewards/margins": 11.640625, + "rewards/rejected": -8.734375, + "step": 4043 + }, + { + "epoch": 0.8024207549977678, + "grad_norm": 28.39004107061117, + "learning_rate": 
2.0308634087224466e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.1875, + "logps/chosen": -1213.0, + "logps/rejected": -748.0, + "loss": 0.3314, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.77734375, + "rewards/margins": 9.71875, + "rewards/rejected": -6.953125, + "step": 4044 + }, + { + "epoch": 0.8026191775385684, + "grad_norm": 33.328416087478814, + "learning_rate": 2.0288791380205016e-07, + "logits/chosen": 3.7109375, + "logits/rejected": 3.60546875, + "logps/chosen": -844.0, + "logps/rejected": -584.5, + "loss": 0.433, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.520751953125, + "rewards/margins": 7.484375, + "rewards/rejected": -4.953125, + "step": 4045 + }, + { + "epoch": 0.802817600079369, + "grad_norm": 31.977219116712835, + "learning_rate": 2.026896532353576e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 3.859375, + "logps/chosen": -1036.0, + "logps/rejected": -686.0, + "loss": 0.3291, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.458984375, + "rewards/margins": 8.5546875, + "rewards/rejected": -6.109375, + "step": 4046 + }, + { + "epoch": 0.8030160226201697, + "grad_norm": 33.5559314866224, + "learning_rate": 2.0249155926726886e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 3.75, + "logps/chosen": -1047.0, + "logps/rejected": -604.0, + "loss": 0.3711, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.69921875, + "rewards/margins": 8.3515625, + "rewards/rejected": -5.65234375, + "step": 4047 + }, + { + "epoch": 0.8032144451609703, + "grad_norm": 36.78639591470322, + "learning_rate": 2.0229363199280653e-07, + "logits/chosen": 4.28125, + "logits/rejected": 3.9453125, + "logps/chosen": -980.0, + "logps/rejected": -554.5, + "loss": 0.3772, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0400390625, + "rewards/margins": 8.21875, + "rewards/rejected": -5.19921875, + "step": 4048 + }, + { + "epoch": 0.8034128677017709, + "grad_norm": 39.00936118636696, + "learning_rate": 2.0209587150691226e-07, + 
"logits/chosen": 4.15234375, + "logits/rejected": 3.83203125, + "logps/chosen": -945.5, + "logps/rejected": -1432.5, + "loss": 0.4519, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.971923828125, + "rewards/margins": 9.421875, + "rewards/rejected": -7.458984375, + "step": 4049 + }, + { + "epoch": 0.8036112902425716, + "grad_norm": 36.53245464055404, + "learning_rate": 2.0189827790444862e-07, + "logits/chosen": 4.09375, + "logits/rejected": 3.90625, + "logps/chosen": -1076.5, + "logps/rejected": -934.5, + "loss": 0.485, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.58056640625, + "rewards/margins": 8.12890625, + "rewards/rejected": -6.53515625, + "step": 4050 + }, + { + "epoch": 0.8038097127833722, + "grad_norm": 33.11387713520894, + "learning_rate": 2.0170085128019768e-07, + "logits/chosen": 3.66015625, + "logits/rejected": 3.921875, + "logps/chosen": -802.0, + "logps/rejected": -799.0, + "loss": 0.4828, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.888671875, + "rewards/margins": 8.125, + "rewards/rejected": -6.2421875, + "step": 4051 + }, + { + "epoch": 0.8040081353241728, + "grad_norm": 35.72528951892017, + "learning_rate": 2.0150359172886132e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 3.44140625, + "logps/chosen": -950.0, + "logps/rejected": -567.5, + "loss": 0.3059, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.642333984375, + "rewards/margins": 8.6015625, + "rewards/rejected": -6.9453125, + "step": 4052 + }, + { + "epoch": 0.8042065578649734, + "grad_norm": 26.7659316336251, + "learning_rate": 2.013064993450618e-07, + "logits/chosen": 3.6484375, + "logits/rejected": 3.9296875, + "logps/chosen": -928.0, + "logps/rejected": -571.0, + "loss": 0.4613, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.561279296875, + "rewards/margins": 7.5390625, + "rewards/rejected": -5.96875, + "step": 4053 + }, + { + "epoch": 0.8044049804057741, + "grad_norm": 39.01528643956281, + "learning_rate": 2.0110957422334035e-07, + 
"logits/chosen": 3.85546875, + "logits/rejected": 3.65234375, + "logps/chosen": -1075.0, + "logps/rejected": -1615.0, + "loss": 0.3851, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.8662109375, + "rewards/margins": 11.125, + "rewards/rejected": -9.2578125, + "step": 4054 + }, + { + "epoch": 0.8046034029465747, + "grad_norm": 21.448998172959985, + "learning_rate": 2.0091281645815887e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.8515625, + "logps/chosen": -954.0, + "logps/rejected": -799.0, + "loss": 0.3668, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.90234375, + "rewards/margins": 9.9609375, + "rewards/rejected": -7.0390625, + "step": 4055 + }, + { + "epoch": 0.8048018254873753, + "grad_norm": 32.13191271657154, + "learning_rate": 2.0071622614389833e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.0390625, + "logps/chosen": -816.5, + "logps/rejected": -621.5, + "loss": 0.473, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.9248046875, + "rewards/margins": 7.3359375, + "rewards/rejected": -5.41015625, + "step": 4056 + }, + { + "epoch": 0.805000248028176, + "grad_norm": 30.191774834983057, + "learning_rate": 2.0051980337485975e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.8359375, + "logps/chosen": -875.5, + "logps/rejected": -772.0, + "loss": 0.3044, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.099609375, + "rewards/margins": 9.375, + "rewards/rejected": -6.2734375, + "step": 4057 + }, + { + "epoch": 0.8051986705689766, + "grad_norm": 23.16709181088182, + "learning_rate": 2.0032354824526354e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 4.046875, + "logps/chosen": -1320.0, + "logps/rejected": -722.5, + "loss": 0.4518, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.4375, + "rewards/margins": 9.484375, + "rewards/rejected": -6.0625, + "step": 4058 + }, + { + "epoch": 0.8053970931097773, + "grad_norm": 31.138179079090566, + "learning_rate": 2.0012746084924974e-07, + "logits/chosen": 
3.69140625, + "logits/rejected": 3.83203125, + "logps/chosen": -893.0, + "logps/rejected": -665.0, + "loss": 0.4145, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.14599609375, + "rewards/margins": 7.021484375, + "rewards/rejected": -4.88671875, + "step": 4059 + }, + { + "epoch": 0.8055955156505779, + "grad_norm": 28.706727409521754, + "learning_rate": 1.9993154128087834e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 4.03515625, + "logps/chosen": -766.0, + "logps/rejected": -718.0, + "loss": 0.2718, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.982421875, + "rewards/margins": 9.203125, + "rewards/rejected": -6.1953125, + "step": 4060 + }, + { + "epoch": 0.8057939381913786, + "grad_norm": 26.46080275716306, + "learning_rate": 1.99735789634128e-07, + "logits/chosen": 4.625, + "logits/rejected": 4.546875, + "logps/chosen": -1080.0, + "logps/rejected": -947.0, + "loss": 0.2009, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.828125, + "rewards/margins": 11.65625, + "rewards/rejected": -8.8203125, + "step": 4061 + }, + { + "epoch": 0.8059923607321792, + "grad_norm": 30.700412944603706, + "learning_rate": 1.9954020600289763e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 3.8203125, + "logps/chosen": -1115.0, + "logps/rejected": -605.0, + "loss": 0.3225, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.62890625, + "rewards/margins": 8.578125, + "rewards/rejected": -5.9453125, + "step": 4062 + }, + { + "epoch": 0.8061907832729798, + "grad_norm": 27.20651124118183, + "learning_rate": 1.993447904810051e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 4.046875, + "logps/chosen": -1079.0, + "logps/rejected": -903.0, + "loss": 0.4054, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.513671875, + "rewards/margins": 7.3984375, + "rewards/rejected": -3.88671875, + "step": 4063 + }, + { + "epoch": 0.8063892058137805, + "grad_norm": 31.914200291039023, + "learning_rate": 1.9914954316218785e-07, + "logits/chosen": 3.7890625, + 
"logits/rejected": 4.0625, + "logps/chosen": -1075.0, + "logps/rejected": -527.5, + "loss": 0.4368, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.507568359375, + "rewards/margins": 6.40234375, + "rewards/rejected": -4.89453125, + "step": 4064 + }, + { + "epoch": 0.8065876283545811, + "grad_norm": 31.7969949996727, + "learning_rate": 1.989544641401024e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.1796875, + "logps/chosen": -1018.0, + "logps/rejected": -1775.0, + "loss": 0.3968, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.984375, + "rewards/margins": 12.3125, + "rewards/rejected": -10.3359375, + "step": 4065 + }, + { + "epoch": 0.8067860508953817, + "grad_norm": 28.940436259002606, + "learning_rate": 1.987595535083249e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.0546875, + "logps/chosen": -1818.0, + "logps/rejected": -777.0, + "loss": 0.366, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.46875, + "rewards/margins": 6.7734375, + "rewards/rejected": -6.30078125, + "step": 4066 + }, + { + "epoch": 0.8069844734361824, + "grad_norm": 24.30931195653621, + "learning_rate": 1.985648113603502e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 3.8828125, + "logps/chosen": -886.0, + "logps/rejected": -590.0, + "loss": 0.3606, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8515625, + "rewards/margins": 8.8984375, + "rewards/rejected": -6.0625, + "step": 4067 + }, + { + "epoch": 0.807182895976983, + "grad_norm": 41.246470500298635, + "learning_rate": 1.9837023778959298e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.1875, + "logps/chosen": -954.0, + "logps/rejected": -722.0, + "loss": 0.3812, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.109375, + "rewards/margins": 7.828125, + "rewards/rejected": -5.703125, + "step": 4068 + }, + { + "epoch": 0.8073813185177836, + "grad_norm": 29.378597037819002, + "learning_rate": 1.9817583288938662e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 4.140625, 
+ "logps/chosen": -1578.0, + "logps/rejected": -709.0, + "loss": 0.45, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.642578125, + "rewards/margins": 8.203125, + "rewards/rejected": -5.56640625, + "step": 4069 + }, + { + "epoch": 0.8075797410585842, + "grad_norm": 40.56234893088263, + "learning_rate": 1.9798159675298365e-07, + "logits/chosen": 4.0, + "logits/rejected": 3.6875, + "logps/chosen": -1062.0, + "logps/rejected": -671.0, + "loss": 0.4374, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.328125, + "rewards/margins": 7.92578125, + "rewards/rejected": -5.611328125, + "step": 4070 + }, + { + "epoch": 0.8077781635993849, + "grad_norm": 30.014652172921984, + "learning_rate": 1.9778752947355572e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 3.62890625, + "logps/chosen": -791.5, + "logps/rejected": -783.0, + "loss": 0.4126, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8349609375, + "rewards/margins": 8.5, + "rewards/rejected": -6.67578125, + "step": 4071 + }, + { + "epoch": 0.8079765861401855, + "grad_norm": 29.764555604225915, + "learning_rate": 1.9759363114419336e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 3.98046875, + "logps/chosen": -745.0, + "logps/rejected": -585.5, + "loss": 0.4669, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.6806640625, + "rewards/margins": 6.71484375, + "rewards/rejected": -5.03125, + "step": 4072 + }, + { + "epoch": 0.8081750086809861, + "grad_norm": 34.385917033644276, + "learning_rate": 1.9739990185790651e-07, + "logits/chosen": 4.015625, + "logits/rejected": 3.84765625, + "logps/chosen": -1002.0, + "logps/rejected": -635.5, + "loss": 0.4266, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.51953125, + "rewards/margins": 6.859375, + "rewards/rejected": -5.33984375, + "step": 4073 + }, + { + "epoch": 0.8083734312217868, + "grad_norm": 37.65887903157505, + "learning_rate": 1.9720634170762307e-07, + "logits/chosen": 4.22265625, + "logits/rejected": 3.7890625, + "logps/chosen": 
-1006.0, + "logps/rejected": -637.5, + "loss": 0.3985, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6796875, + "rewards/margins": 7.6171875, + "rewards/rejected": -4.943359375, + "step": 4074 + }, + { + "epoch": 0.8085718537625874, + "grad_norm": 26.49020188658046, + "learning_rate": 1.9701295078619094e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 3.7265625, + "logps/chosen": -939.5, + "logps/rejected": -581.25, + "loss": 0.3298, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.2734375, + "rewards/margins": 8.046875, + "rewards/rejected": -5.78125, + "step": 4075 + }, + { + "epoch": 0.808770276303388, + "grad_norm": 29.939668047901712, + "learning_rate": 1.9681972918637613e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 4.07421875, + "logps/chosen": -1029.0, + "logps/rejected": -1160.0, + "loss": 0.4105, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.119140625, + "rewards/margins": 10.21875, + "rewards/rejected": -8.109375, + "step": 4076 + }, + { + "epoch": 0.8089686988441886, + "grad_norm": 34.22473172608372, + "learning_rate": 1.966266770008634e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.67578125, + "logps/chosen": -734.0, + "logps/rejected": -1716.5, + "loss": 0.4295, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.81640625, + "rewards/margins": 12.1015625, + "rewards/rejected": -10.3046875, + "step": 4077 + }, + { + "epoch": 0.8091671213849894, + "grad_norm": 33.12157317354471, + "learning_rate": 1.9643379432225693e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.2890625, + "logps/chosen": -1279.0, + "logps/rejected": -1736.0, + "loss": 0.4589, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.4228515625, + "rewards/margins": 9.6484375, + "rewards/rejected": -7.24609375, + "step": 4078 + }, + { + "epoch": 0.80936554392579, + "grad_norm": 47.12539056879148, + "learning_rate": 1.9624108124307857e-07, + "logits/chosen": 3.70703125, + "logits/rejected": 3.640625, + "logps/chosen": -1027.0, 
+ "logps/rejected": -617.5, + "loss": 0.3776, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.056640625, + "rewards/margins": 7.2421875, + "rewards/rejected": -5.177734375, + "step": 4079 + }, + { + "epoch": 0.8095639664665906, + "grad_norm": 27.675262952520136, + "learning_rate": 1.9604853785576975e-07, + "logits/chosen": 3.50390625, + "logits/rejected": 3.68359375, + "logps/chosen": -1266.0, + "logps/rejected": -927.0, + "loss": 0.3532, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1171875, + "rewards/margins": 9.9453125, + "rewards/rejected": -6.828125, + "step": 4080 + }, + { + "epoch": 0.8097623890073913, + "grad_norm": 31.924326332268276, + "learning_rate": 1.9585616425269003e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 4.0078125, + "logps/chosen": -989.0, + "logps/rejected": -696.5, + "loss": 0.3659, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.0302734375, + "rewards/margins": 8.5625, + "rewards/rejected": -6.53125, + "step": 4081 + }, + { + "epoch": 0.8099608115481919, + "grad_norm": 33.75348329417101, + "learning_rate": 1.956639605261175e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.32421875, + "logps/chosen": -705.5, + "logps/rejected": -661.5, + "loss": 0.5818, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.115234375, + "rewards/margins": 6.6640625, + "rewards/rejected": -4.5546875, + "step": 4082 + }, + { + "epoch": 0.8101592340889925, + "grad_norm": 31.038349185661463, + "learning_rate": 1.9547192676824894e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.796875, + "logps/chosen": -1205.0, + "logps/rejected": -1134.0, + "loss": 0.3675, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0625, + "rewards/margins": 10.5234375, + "rewards/rejected": -7.4296875, + "step": 4083 + }, + { + "epoch": 0.8103576566297932, + "grad_norm": 40.19197816869314, + "learning_rate": 1.9528006307119942e-07, + "logits/chosen": 3.69921875, + "logits/rejected": 3.7578125, + "logps/chosen": -976.0, + 
"logps/rejected": -802.0, + "loss": 0.4465, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.888671875, + "rewards/margins": 9.09375, + "rewards/rejected": -7.19921875, + "step": 4084 + }, + { + "epoch": 0.8105560791705938, + "grad_norm": 36.36544123460036, + "learning_rate": 1.950883695270029e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.72265625, + "logps/chosen": -790.0, + "logps/rejected": -551.0, + "loss": 0.4506, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.81640625, + "rewards/margins": 5.46875, + "rewards/rejected": -3.63671875, + "step": 4085 + }, + { + "epoch": 0.8107545017113944, + "grad_norm": 20.23136273383764, + "learning_rate": 1.9489684622761093e-07, + "logits/chosen": 3.65625, + "logits/rejected": 3.65234375, + "logps/chosen": -1082.0, + "logps/rejected": -588.0, + "loss": 0.3934, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.94189453125, + "rewards/margins": 7.4921875, + "rewards/rejected": -5.546875, + "step": 4086 + }, + { + "epoch": 0.810952924252195, + "grad_norm": 34.62674842430145, + "learning_rate": 1.947054932648941e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 4.05078125, + "logps/chosen": -877.0, + "logps/rejected": -529.0, + "loss": 0.4782, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.6416015625, + "rewards/margins": 6.376953125, + "rewards/rejected": -4.74609375, + "step": 4087 + }, + { + "epoch": 0.8111513467929957, + "grad_norm": 28.02811113931961, + "learning_rate": 1.9451431073064096e-07, + "logits/chosen": 4.6328125, + "logits/rejected": 4.43359375, + "logps/chosen": -1199.0, + "logps/rejected": -921.0, + "loss": 0.3119, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.2890625, + "rewards/margins": 10.359375, + "rewards/rejected": -7.0703125, + "step": 4088 + }, + { + "epoch": 0.8113497693337963, + "grad_norm": 32.54972715239939, + "learning_rate": 1.9432329871655836e-07, + "logits/chosen": 3.921875, + "logits/rejected": 3.80859375, + "logps/chosen": -777.0, + 
"logps/rejected": -759.0, + "loss": 0.518, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.07421875, + "rewards/margins": 6.5703125, + "rewards/rejected": -4.48828125, + "step": 4089 + }, + { + "epoch": 0.8115481918745969, + "grad_norm": 24.408586793401447, + "learning_rate": 1.941324573142714e-07, + "logits/chosen": 3.85546875, + "logits/rejected": 3.64453125, + "logps/chosen": -1158.0, + "logps/rejected": -756.0, + "loss": 0.4252, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.23828125, + "rewards/margins": 8.578125, + "rewards/rejected": -5.328125, + "step": 4090 + }, + { + "epoch": 0.8117466144153976, + "grad_norm": 29.23049127596647, + "learning_rate": 1.939417866153232e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 4.08203125, + "logps/chosen": -1178.0, + "logps/rejected": -1335.0, + "loss": 0.2996, + "rewards/accuracies": 0.875, + "rewards/chosen": 4.4296875, + "rewards/margins": 14.34375, + "rewards/rejected": -9.8984375, + "step": 4091 + }, + { + "epoch": 0.8119450369561982, + "grad_norm": 31.99075255360814, + "learning_rate": 1.9375128671117527e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 3.87890625, + "logps/chosen": -931.0, + "logps/rejected": -610.5, + "loss": 0.5035, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.9296875, + "rewards/margins": 6.607421875, + "rewards/rejected": -4.6865234375, + "step": 4092 + }, + { + "epoch": 0.8121434594969988, + "grad_norm": 34.85073720322827, + "learning_rate": 1.9356095769320702e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.234375, + "logps/chosen": -1110.0, + "logps/rejected": -798.0, + "loss": 0.3229, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.185546875, + "rewards/margins": 8.7265625, + "rewards/rejected": -5.5390625, + "step": 4093 + }, + { + "epoch": 0.8123418820377994, + "grad_norm": 25.19741903630036, + "learning_rate": 1.9337079965271578e-07, + "logits/chosen": 3.90625, + "logits/rejected": 4.1015625, + "logps/chosen": -1172.0, + 
"logps/rejected": -791.5, + "loss": 0.3424, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.982421875, + "rewards/margins": 9.984375, + "rewards/rejected": -7.0234375, + "step": 4094 + }, + { + "epoch": 0.8125403045786002, + "grad_norm": 29.913188713430927, + "learning_rate": 1.9318081268091707e-07, + "logits/chosen": 3.58984375, + "logits/rejected": 3.984375, + "logps/chosen": -1169.0, + "logps/rejected": -847.0, + "loss": 0.4588, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5263671875, + "rewards/margins": 8.46875, + "rewards/rejected": -5.93359375, + "step": 4095 + }, + { + "epoch": 0.8127387271194008, + "grad_norm": 35.037221310467615, + "learning_rate": 1.929909968689442e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 3.98828125, + "logps/chosen": -944.0, + "logps/rejected": -685.5, + "loss": 0.3787, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.23046875, + "rewards/margins": 8.734375, + "rewards/rejected": -6.5078125, + "step": 4096 + }, + { + "epoch": 0.8129371496602014, + "grad_norm": 36.026483916892566, + "learning_rate": 1.9280135230784845e-07, + "logits/chosen": 3.64453125, + "logits/rejected": 3.59765625, + "logps/chosen": -838.0, + "logps/rejected": -596.5, + "loss": 0.3768, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5390625, + "rewards/margins": 7.1796875, + "rewards/rejected": -4.62890625, + "step": 4097 + }, + { + "epoch": 0.8131355722010021, + "grad_norm": 25.802646153343183, + "learning_rate": 1.9261187908859916e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.30078125, + "logps/chosen": -839.0, + "logps/rejected": -746.0, + "loss": 0.4066, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.08203125, + "rewards/margins": 9.6796875, + "rewards/rejected": -6.5859375, + "step": 4098 + }, + { + "epoch": 0.8133339947418027, + "grad_norm": 28.686714016477378, + "learning_rate": 1.9242257730208284e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 4.0234375, + "logps/chosen": -1320.0, + 
"logps/rejected": -995.0, + "loss": 0.3578, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.020751953125, + "rewards/margins": 7.921875, + "rewards/rejected": -4.884765625, + "step": 4099 + }, + { + "epoch": 0.8135324172826033, + "grad_norm": 23.405563459288405, + "learning_rate": 1.922334470391046e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.19921875, + "logps/chosen": -1010.0, + "logps/rejected": -851.0, + "loss": 0.3363, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.43359375, + "rewards/margins": 14.484375, + "rewards/rejected": -12.04296875, + "step": 4100 + }, + { + "epoch": 0.8137308398234039, + "grad_norm": 34.05709142686195, + "learning_rate": 1.9204448839038673e-07, + "logits/chosen": 3.88671875, + "logits/rejected": 3.8984375, + "logps/chosen": -1274.0, + "logps/rejected": -781.0, + "loss": 0.3044, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.16796875, + "rewards/margins": 9.234375, + "rewards/rejected": -6.046875, + "step": 4101 + }, + { + "epoch": 0.8139292623642046, + "grad_norm": 35.67373641985192, + "learning_rate": 1.9185570144656913e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 3.97265625, + "logps/chosen": -1005.0, + "logps/rejected": -1532.0, + "loss": 0.4295, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8572998046875, + "rewards/margins": 9.859375, + "rewards/rejected": -8.01953125, + "step": 4102 + }, + { + "epoch": 0.8141276849050052, + "grad_norm": 25.62585587229605, + "learning_rate": 1.9166708629820998e-07, + "logits/chosen": 3.76953125, + "logits/rejected": 3.54296875, + "logps/chosen": -1111.0, + "logps/rejected": -665.0, + "loss": 0.3772, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.873046875, + "rewards/margins": 7.6796875, + "rewards/rejected": -5.8046875, + "step": 4103 + }, + { + "epoch": 0.8143261074458058, + "grad_norm": 24.984697519909847, + "learning_rate": 1.914786430357842e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.2265625, + "logps/chosen": -1124.0, + 
"logps/rejected": -712.0, + "loss": 0.449, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.140625, + "rewards/margins": 9.078125, + "rewards/rejected": -5.9453125, + "step": 4104 + }, + { + "epoch": 0.8145245299866065, + "grad_norm": 36.4710284133098, + "learning_rate": 1.9129037174968505e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 4.1484375, + "logps/chosen": -1128.0, + "logps/rejected": -1364.0, + "loss": 0.3747, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.591796875, + "rewards/margins": 10.9296875, + "rewards/rejected": -8.3359375, + "step": 4105 + }, + { + "epoch": 0.8147229525274071, + "grad_norm": 27.119863030475333, + "learning_rate": 1.9110227253022237e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.87890625, + "logps/chosen": -1302.0, + "logps/rejected": -820.0, + "loss": 0.3022, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.2734375, + "rewards/margins": 10.015625, + "rewards/rejected": -6.734375, + "step": 4106 + }, + { + "epoch": 0.8149213750682077, + "grad_norm": 31.168309405982598, + "learning_rate": 1.909143454676246e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 3.71875, + "logps/chosen": -1065.0, + "logps/rejected": -758.0, + "loss": 0.4334, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.146484375, + "rewards/margins": 9.8203125, + "rewards/rejected": -7.6796875, + "step": 4107 + }, + { + "epoch": 0.8151197976090084, + "grad_norm": 31.487413395521795, + "learning_rate": 1.9072659065203673e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.36328125, + "logps/chosen": -1444.0, + "logps/rejected": -1043.0, + "loss": 0.4153, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.296875, + "rewards/margins": 8.5, + "rewards/rejected": -6.203125, + "step": 4108 + }, + { + "epoch": 0.815318220149809, + "grad_norm": 34.40658985206582, + "learning_rate": 1.905390081735213e-07, + "logits/chosen": 4.125, + "logits/rejected": 3.7265625, + "logps/chosen": -989.0, + "logps/rejected": -576.0, + 
"loss": 0.3046, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.80859375, + "rewards/margins": 8.765625, + "rewards/rejected": -5.9453125, + "step": 4109 + }, + { + "epoch": 0.8155166426906096, + "grad_norm": 32.36426207858405, + "learning_rate": 1.9035159812205875e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 4.25, + "logps/chosen": -779.0, + "logps/rejected": -644.0, + "loss": 0.4318, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.365234375, + "rewards/margins": 8.671875, + "rewards/rejected": -6.296875, + "step": 4110 + }, + { + "epoch": 0.8157150652314102, + "grad_norm": 23.13063807181716, + "learning_rate": 1.9016436058754577e-07, + "logits/chosen": 3.796875, + "logits/rejected": 3.73046875, + "logps/chosen": -1099.0, + "logps/rejected": -694.0, + "loss": 0.2913, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.875, + "rewards/margins": 9.75, + "rewards/rejected": -5.875, + "step": 4111 + }, + { + "epoch": 0.815913487772211, + "grad_norm": 35.505690464618276, + "learning_rate": 1.899772956597973e-07, + "logits/chosen": 3.33203125, + "logits/rejected": 3.79296875, + "logps/chosen": -802.0, + "logps/rejected": -1684.5, + "loss": 0.3848, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.69140625, + "rewards/margins": 11.265625, + "rewards/rejected": -9.5859375, + "step": 4112 + }, + { + "epoch": 0.8161119103130116, + "grad_norm": 34.58424508944758, + "learning_rate": 1.897904034285449e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.984375, + "logps/chosen": -1130.0, + "logps/rejected": -761.0, + "loss": 0.3245, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.78125, + "rewards/margins": 8.6796875, + "rewards/rejected": -5.8984375, + "step": 4113 + }, + { + "epoch": 0.8163103328538122, + "grad_norm": 36.196035282291035, + "learning_rate": 1.8960368398343746e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.328125, + "logps/chosen": -835.0, + "logps/rejected": -1106.0, + "loss": 0.3541, + 
"rewards/accuracies": 0.90625, + "rewards/chosen": 2.8671875, + "rewards/margins": 11.953125, + "rewards/rejected": -9.09375, + "step": 4114 + }, + { + "epoch": 0.8165087553946129, + "grad_norm": 29.994647422551253, + "learning_rate": 1.8941713741404106e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 4.09765625, + "logps/chosen": -1019.0, + "logps/rejected": -880.0, + "loss": 0.3304, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.490966796875, + "rewards/margins": 9.625, + "rewards/rejected": -7.15625, + "step": 4115 + }, + { + "epoch": 0.8167071779354135, + "grad_norm": 30.2561661839479, + "learning_rate": 1.8923076380983848e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.96875, + "logps/chosen": -1196.0, + "logps/rejected": -1567.0, + "loss": 0.3854, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.697265625, + "rewards/margins": 9.40625, + "rewards/rejected": -6.71484375, + "step": 4116 + }, + { + "epoch": 0.8169056004762141, + "grad_norm": 38.25255931448115, + "learning_rate": 1.8904456326023027e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 4.04296875, + "logps/chosen": -901.0, + "logps/rejected": -1474.0, + "loss": 0.4501, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.111328125, + "rewards/margins": 7.265625, + "rewards/rejected": -5.15625, + "step": 4117 + }, + { + "epoch": 0.8171040230170147, + "grad_norm": 37.10238304610597, + "learning_rate": 1.8885853585453328e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 4.0703125, + "logps/chosen": -1172.0, + "logps/rejected": -856.0, + "loss": 0.4919, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.912109375, + "rewards/margins": 7.5546875, + "rewards/rejected": -5.6484375, + "step": 4118 + }, + { + "epoch": 0.8173024455578154, + "grad_norm": 26.39999700006483, + "learning_rate": 1.8867268168198163e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.15625, + "logps/chosen": -1066.0, + "logps/rejected": -739.0, + "loss": 0.3663, + 
"rewards/accuracies": 0.84375, + "rewards/chosen": 2.9765625, + "rewards/margins": 8.234375, + "rewards/rejected": -5.265625, + "step": 4119 + }, + { + "epoch": 0.817500868098616, + "grad_norm": 26.223305528833347, + "learning_rate": 1.8848700083172615e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 4.1953125, + "logps/chosen": -1091.0, + "logps/rejected": -786.0, + "loss": 0.4192, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1865234375, + "rewards/margins": 7.75, + "rewards/rejected": -5.5546875, + "step": 4120 + }, + { + "epoch": 0.8176992906394166, + "grad_norm": 21.61581881737017, + "learning_rate": 1.883014933928348e-07, + "logits/chosen": 3.453125, + "logits/rejected": 3.5, + "logps/chosen": -1052.0, + "logps/rejected": -664.5, + "loss": 0.2691, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.990234375, + "rewards/margins": 8.5546875, + "rewards/rejected": -5.5859375, + "step": 4121 + }, + { + "epoch": 0.8178977131802173, + "grad_norm": 32.39279013853062, + "learning_rate": 1.881161594542919e-07, + "logits/chosen": 4.29296875, + "logits/rejected": 4.203125, + "logps/chosen": -1326.0, + "logps/rejected": -980.0, + "loss": 0.4425, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.68359375, + "rewards/margins": 9.015625, + "rewards/rejected": -5.3203125, + "step": 4122 + }, + { + "epoch": 0.8180961357210179, + "grad_norm": 18.14080804421951, + "learning_rate": 1.8793099910499925e-07, + "logits/chosen": 4.46484375, + "logits/rejected": 4.25, + "logps/chosen": -1305.0, + "logps/rejected": -858.0, + "loss": 0.2547, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.828125, + "rewards/margins": 9.78125, + "rewards/rejected": -5.953125, + "step": 4123 + }, + { + "epoch": 0.8182945582618185, + "grad_norm": 34.65666098762667, + "learning_rate": 1.8774601243377474e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.10546875, + "logps/chosen": -1144.0, + "logps/rejected": -920.0, + "loss": 0.3959, + "rewards/accuracies": 0.875, + 
"rewards/chosen": 3.12890625, + "rewards/margins": 9.625, + "rewards/rejected": -6.4921875, + "step": 4124 + }, + { + "epoch": 0.8184929808026192, + "grad_norm": 29.469282817946027, + "learning_rate": 1.8756119952935335e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 3.98046875, + "logps/chosen": -861.0, + "logps/rejected": -656.0, + "loss": 0.2908, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7607421875, + "rewards/margins": 9.4140625, + "rewards/rejected": -6.6640625, + "step": 4125 + }, + { + "epoch": 0.8186914033434198, + "grad_norm": 29.024525494244592, + "learning_rate": 1.8737656048038636e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.23046875, + "logps/chosen": -733.5, + "logps/rejected": -715.5, + "loss": 0.3849, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5078125, + "rewards/margins": 7.375, + "rewards/rejected": -4.857421875, + "step": 4126 + }, + { + "epoch": 0.8188898258842204, + "grad_norm": 32.23545167053801, + "learning_rate": 1.8719209537544195e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.53515625, + "logps/chosen": -866.0, + "logps/rejected": -1888.0, + "loss": 0.4884, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.259765625, + "rewards/margins": 18.1328125, + "rewards/rejected": -15.90625, + "step": 4127 + }, + { + "epoch": 0.819088248425021, + "grad_norm": 27.06864820313221, + "learning_rate": 1.8700780430300476e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.34375, + "logps/chosen": -1032.0, + "logps/rejected": -839.0, + "loss": 0.3832, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.392578125, + "rewards/margins": 9.90625, + "rewards/rejected": -6.51171875, + "step": 4128 + }, + { + "epoch": 0.8192866709658218, + "grad_norm": 25.252794685166663, + "learning_rate": 1.8682368735147575e-07, + "logits/chosen": 3.828125, + "logits/rejected": 4.0546875, + "logps/chosen": -963.0, + "logps/rejected": -663.5, + "loss": 0.4424, + "rewards/accuracies": 0.8125, + "rewards/chosen": 
2.44921875, + "rewards/margins": 8.2890625, + "rewards/rejected": -5.84375, + "step": 4129 + }, + { + "epoch": 0.8194850935066224, + "grad_norm": 31.245326574684118, + "learning_rate": 1.8663974460917297e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.1328125, + "logps/chosen": -1137.0, + "logps/rejected": -735.5, + "loss": 0.3975, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0859375, + "rewards/margins": 9.1171875, + "rewards/rejected": -6.0390625, + "step": 4130 + }, + { + "epoch": 0.819683516047423, + "grad_norm": 27.599481115636483, + "learning_rate": 1.8645597616433e-07, + "logits/chosen": 3.21484375, + "logits/rejected": 3.1484375, + "logps/chosen": -990.0, + "logps/rejected": -680.0, + "loss": 0.3782, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.400390625, + "rewards/margins": 8.4140625, + "rewards/rejected": -6.0078125, + "step": 4131 + }, + { + "epoch": 0.8198819385882237, + "grad_norm": 38.395893255889206, + "learning_rate": 1.8627238210509765e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 4.140625, + "logps/chosen": -1013.0, + "logps/rejected": -1230.0, + "loss": 0.4416, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.98046875, + "rewards/margins": 9.890625, + "rewards/rejected": -7.9140625, + "step": 4132 + }, + { + "epoch": 0.8200803611290243, + "grad_norm": 25.083211533922416, + "learning_rate": 1.8608896251954265e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.34765625, + "logps/chosen": -937.5, + "logps/rejected": -876.5, + "loss": 0.3409, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.77734375, + "rewards/margins": 9.6953125, + "rewards/rejected": -6.9453125, + "step": 4133 + }, + { + "epoch": 0.8202787836698249, + "grad_norm": 26.825445507941026, + "learning_rate": 1.8590571749564792e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.078125, + "logps/chosen": -1025.0, + "logps/rejected": -749.0, + "loss": 0.2704, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.830078125, + 
"rewards/margins": 9.5, + "rewards/rejected": -6.6640625, + "step": 4134 + }, + { + "epoch": 0.8204772062106255, + "grad_norm": 29.61055927308863, + "learning_rate": 1.8572264712131321e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 3.65625, + "logps/chosen": -1105.0, + "logps/rejected": -802.0, + "loss": 0.3075, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.521484375, + "rewards/margins": 8.65625, + "rewards/rejected": -6.134765625, + "step": 4135 + }, + { + "epoch": 0.8206756287514262, + "grad_norm": 31.069996040476877, + "learning_rate": 1.8553975148435374e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.61328125, + "logps/chosen": -839.0, + "logps/rejected": -495.5, + "loss": 0.3362, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.744140625, + "rewards/margins": 7.09765625, + "rewards/rejected": -4.35546875, + "step": 4136 + }, + { + "epoch": 0.8208740512922268, + "grad_norm": 28.937645867590355, + "learning_rate": 1.8535703067250153e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 4.01953125, + "logps/chosen": -1128.0, + "logps/rejected": -890.0, + "loss": 0.4978, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.830078125, + "rewards/margins": 7.609375, + "rewards/rejected": -4.787109375, + "step": 4137 + }, + { + "epoch": 0.8210724738330274, + "grad_norm": 29.5123245008229, + "learning_rate": 1.8517448477340453e-07, + "logits/chosen": 4.4921875, + "logits/rejected": 4.2734375, + "logps/chosen": -1151.0, + "logps/rejected": -735.0, + "loss": 0.3196, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0390625, + "rewards/margins": 8.78125, + "rewards/rejected": -5.7578125, + "step": 4138 + }, + { + "epoch": 0.8212708963738281, + "grad_norm": 26.811645167657705, + "learning_rate": 1.8499211387462653e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 3.94921875, + "logps/chosen": -782.0, + "logps/rejected": -1271.0, + "loss": 0.441, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.18359375, + 
"rewards/margins": 8.3828125, + "rewards/rejected": -6.1953125, + "step": 4139 + }, + { + "epoch": 0.8214693189146287, + "grad_norm": 37.17713122737517, + "learning_rate": 1.8480991806364798e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.34375, + "logps/chosen": -1011.5, + "logps/rejected": -803.0, + "loss": 0.3848, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.744140625, + "rewards/margins": 14.1953125, + "rewards/rejected": -11.421875, + "step": 4140 + }, + { + "epoch": 0.8216677414554293, + "grad_norm": 30.75898228465852, + "learning_rate": 1.8462789742786456e-07, + "logits/chosen": 3.5546875, + "logits/rejected": 3.703125, + "logps/chosen": -904.5, + "logps/rejected": -943.0, + "loss": 0.3864, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.728515625, + "rewards/margins": 9.7265625, + "rewards/rejected": -7.0, + "step": 4141 + }, + { + "epoch": 0.82186616399623, + "grad_norm": 30.213374538851618, + "learning_rate": 1.8444605205458867e-07, + "logits/chosen": 3.8046875, + "logits/rejected": 3.63671875, + "logps/chosen": -866.0, + "logps/rejected": -801.0, + "loss": 0.5544, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.31640625, + "rewards/margins": 6.6796875, + "rewards/rejected": -4.369140625, + "step": 4142 + }, + { + "epoch": 0.8220645865370306, + "grad_norm": 27.553634050146865, + "learning_rate": 1.842643820310481e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.296875, + "logps/chosen": -987.0, + "logps/rejected": -872.0, + "loss": 0.4048, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.435546875, + "rewards/margins": 8.265625, + "rewards/rejected": -5.828125, + "step": 4143 + }, + { + "epoch": 0.8222630090778312, + "grad_norm": 32.42352540234842, + "learning_rate": 1.8408288744438684e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.04296875, + "logps/chosen": -823.0, + "logps/rejected": -591.5, + "loss": 0.4605, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.40234375, + "rewards/margins": 
7.03125, + "rewards/rejected": -4.62890625, + "step": 4144 + }, + { + "epoch": 0.8224614316186318, + "grad_norm": 32.475031694400506, + "learning_rate": 1.8390156838166462e-07, + "logits/chosen": 4.359375, + "logits/rejected": 4.3984375, + "logps/chosen": -1198.0, + "logps/rejected": -925.0, + "loss": 0.3852, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.6015625, + "rewards/margins": 10.0, + "rewards/rejected": -6.41015625, + "step": 4145 + }, + { + "epoch": 0.8226598541594325, + "grad_norm": 35.70122015004024, + "learning_rate": 1.8372042492985695e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.28515625, + "logps/chosen": -1163.0, + "logps/rejected": -1127.0, + "loss": 0.3565, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.638671875, + "rewards/margins": 12.8984375, + "rewards/rejected": -10.28125, + "step": 4146 + }, + { + "epoch": 0.8228582767002331, + "grad_norm": 31.307654042028123, + "learning_rate": 1.8353945717585508e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.59375, + "logps/chosen": -986.0, + "logps/rejected": -574.0, + "loss": 0.4371, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.478515625, + "rewards/margins": 7.2734375, + "rewards/rejected": -4.78125, + "step": 4147 + }, + { + "epoch": 0.8230566992410338, + "grad_norm": 23.235829340857787, + "learning_rate": 1.8335866520646597e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 3.9921875, + "logps/chosen": -1074.0, + "logps/rejected": -1404.0, + "loss": 0.4254, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.50390625, + "rewards/margins": 11.25, + "rewards/rejected": -7.78125, + "step": 4148 + }, + { + "epoch": 0.8232551217818345, + "grad_norm": 35.59644782901212, + "learning_rate": 1.8317804910841256e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.98828125, + "logps/chosen": -1015.0, + "logps/rejected": -1069.0, + "loss": 0.4529, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.046875, + "rewards/margins": 9.0234375, + 
"rewards/rejected": -5.984375, + "step": 4149 + }, + { + "epoch": 0.8234535443226351, + "grad_norm": 31.05013920830506, + "learning_rate": 1.8299760896833295e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.03515625, + "logps/chosen": -1131.0, + "logps/rejected": -1494.0, + "loss": 0.3183, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.625, + "rewards/margins": 12.0625, + "rewards/rejected": -8.4140625, + "step": 4150 + }, + { + "epoch": 0.8236519668634357, + "grad_norm": 37.34913307561456, + "learning_rate": 1.8281734487278115e-07, + "logits/chosen": 3.703125, + "logits/rejected": 3.3515625, + "logps/chosen": -470.5, + "logps/rejected": -416.0, + "loss": 0.612, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.0888671875, + "rewards/margins": 3.408203125, + "rewards/rejected": -2.3193359375, + "step": 4151 + }, + { + "epoch": 0.8238503894042363, + "grad_norm": 40.32501228740932, + "learning_rate": 1.8263725690822674e-07, + "logits/chosen": 3.08984375, + "logits/rejected": 3.03125, + "logps/chosen": -526.5, + "logps/rejected": -434.5, + "loss": 0.4165, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.5, + "rewards/margins": 5.828125, + "rewards/rejected": -4.328125, + "step": 4152 + }, + { + "epoch": 0.824048811945037, + "grad_norm": 25.572413628653866, + "learning_rate": 1.8245734516105454e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.89453125, + "logps/chosen": -865.0, + "logps/rejected": -751.0, + "loss": 0.2769, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.2734375, + "rewards/margins": 9.7578125, + "rewards/rejected": -7.484375, + "step": 4153 + }, + { + "epoch": 0.8242472344858376, + "grad_norm": 41.91311370468186, + "learning_rate": 1.8227760971756504e-07, + "logits/chosen": 4.1875, + "logits/rejected": 3.9453125, + "logps/chosen": -982.5, + "logps/rejected": -519.5, + "loss": 0.3489, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.41363525390625, + "rewards/margins": 7.0625, + "rewards/rejected": -4.640625, 
+ "step": 4154 + }, + { + "epoch": 0.8244456570266382, + "grad_norm": 31.037967210025347, + "learning_rate": 1.8209805066397432e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.8125, + "logps/chosen": -994.0, + "logps/rejected": -599.0, + "loss": 0.3727, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6953125, + "rewards/margins": 7.4921875, + "rewards/rejected": -4.796875, + "step": 4155 + }, + { + "epoch": 0.8246440795674389, + "grad_norm": 29.994564271699325, + "learning_rate": 1.8191866808641355e-07, + "logits/chosen": 4.34765625, + "logits/rejected": 4.609375, + "logps/chosen": -1008.5, + "logps/rejected": -960.0, + "loss": 0.5621, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.08984375, + "rewards/margins": 7.546142578125, + "rewards/rejected": -4.466796875, + "step": 4156 + }, + { + "epoch": 0.8248425021082395, + "grad_norm": 22.76989363897893, + "learning_rate": 1.8173946207092934e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.94140625, + "logps/chosen": -1155.0, + "logps/rejected": -764.0, + "loss": 0.3317, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.103515625, + "rewards/margins": 8.5078125, + "rewards/rejected": -5.41015625, + "step": 4157 + }, + { + "epoch": 0.8250409246490401, + "grad_norm": 32.829333704239495, + "learning_rate": 1.815604327034837e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.08203125, + "logps/chosen": -917.0, + "logps/rejected": -601.0, + "loss": 0.385, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.091796875, + "rewards/margins": 6.8671875, + "rewards/rejected": -4.76953125, + "step": 4158 + }, + { + "epoch": 0.8252393471898407, + "grad_norm": 29.860812473062687, + "learning_rate": 1.8138158006995363e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.06640625, + "logps/chosen": -1117.5, + "logps/rejected": -733.0, + "loss": 0.2867, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.6328125, + "rewards/margins": 9.65625, + "rewards/rejected": -6.01953125, + 
"step": 4159 + }, + { + "epoch": 0.8254377697306414, + "grad_norm": 33.15178007466884, + "learning_rate": 1.8120290425613202e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 3.98828125, + "logps/chosen": -1039.0, + "logps/rejected": -783.0, + "loss": 0.3343, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0859375, + "rewards/margins": 8.21875, + "rewards/rejected": -5.12890625, + "step": 4160 + }, + { + "epoch": 0.825636192271442, + "grad_norm": 24.89574297385552, + "learning_rate": 1.8102440534772596e-07, + "logits/chosen": 3.85546875, + "logits/rejected": 4.0, + "logps/chosen": -1004.0, + "logps/rejected": -1077.5, + "loss": 0.4061, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.98828125, + "rewards/margins": 8.8828125, + "rewards/rejected": -5.90234375, + "step": 4161 + }, + { + "epoch": 0.8258346148122426, + "grad_norm": 33.896919357427464, + "learning_rate": 1.8084608343035853e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 4.08984375, + "logps/chosen": -1232.0, + "logps/rejected": -824.0, + "loss": 0.357, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.61328125, + "rewards/margins": 8.734375, + "rewards/rejected": -5.12109375, + "step": 4162 + }, + { + "epoch": 0.8260330373530433, + "grad_norm": 43.786347367581754, + "learning_rate": 1.8066793858956752e-07, + "logits/chosen": 3.5, + "logits/rejected": 3.38671875, + "logps/chosen": -1118.0, + "logps/rejected": -931.0, + "loss": 0.287, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1484375, + "rewards/margins": 9.40625, + "rewards/rejected": -6.265625, + "step": 4163 + }, + { + "epoch": 0.826231459893844, + "grad_norm": 29.434305239093504, + "learning_rate": 1.8048997091080568e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.296875, + "logps/chosen": -1140.0, + "logps/rejected": -2256.0, + "loss": 0.3213, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.548828125, + "rewards/margins": 13.5234375, + "rewards/rejected": -10.9921875, + "step": 4164 + }, + { 
+ "epoch": 0.8264298824346445, + "grad_norm": 25.536428231086145, + "learning_rate": 1.803121804794413e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 3.8828125, + "logps/chosen": -864.0, + "logps/rejected": -600.5, + "loss": 0.3918, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.046875, + "rewards/margins": 8.2734375, + "rewards/rejected": -5.2109375, + "step": 4165 + }, + { + "epoch": 0.8266283049754453, + "grad_norm": 29.434058476402935, + "learning_rate": 1.8013456738075688e-07, + "logits/chosen": 4.52734375, + "logits/rejected": 4.1171875, + "logps/chosen": -927.0, + "logps/rejected": -670.0, + "loss": 0.3527, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.546875, + "rewards/margins": 7.3828125, + "rewards/rejected": -4.8359375, + "step": 4166 + }, + { + "epoch": 0.8268267275162459, + "grad_norm": 38.79080391248796, + "learning_rate": 1.7995713169995057e-07, + "logits/chosen": 4.37890625, + "logits/rejected": 4.203125, + "logps/chosen": -1414.0, + "logps/rejected": -844.0, + "loss": 0.3536, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.28125, + "rewards/margins": 9.84375, + "rewards/rejected": -6.5625, + "step": 4167 + }, + { + "epoch": 0.8270251500570465, + "grad_norm": 28.194559054448476, + "learning_rate": 1.7977987352213496e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 4.19140625, + "logps/chosen": -830.0, + "logps/rejected": -689.0, + "loss": 0.466, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.044921875, + "rewards/margins": 6.1328125, + "rewards/rejected": -4.08984375, + "step": 4168 + }, + { + "epoch": 0.8272235725978471, + "grad_norm": 23.78472953637277, + "learning_rate": 1.796027929323378e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.26953125, + "logps/chosen": -1065.5, + "logps/rejected": -891.5, + "loss": 0.3378, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.01171875, + "rewards/margins": 8.765625, + "rewards/rejected": -5.7421875, + "step": 4169 + }, + { + "epoch": 
0.8274219951386478, + "grad_norm": 33.86926892291866, + "learning_rate": 1.7942589001550134e-07, + "logits/chosen": 4.203125, + "logits/rejected": 3.7109375, + "logps/chosen": -991.0, + "logps/rejected": -935.5, + "loss": 0.3691, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.9873046875, + "rewards/margins": 7.46875, + "rewards/rejected": -5.47265625, + "step": 4170 + }, + { + "epoch": 0.8276204176794484, + "grad_norm": 29.650194621560484, + "learning_rate": 1.7924916485648274e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.109375, + "logps/chosen": -1083.0, + "logps/rejected": -911.0, + "loss": 0.5128, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.578125, + "rewards/margins": 6.2265625, + "rewards/rejected": -4.6484375, + "step": 4171 + }, + { + "epoch": 0.827818840220249, + "grad_norm": 36.118935284147355, + "learning_rate": 1.790726175400542e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.92578125, + "logps/chosen": -970.0, + "logps/rejected": -731.0, + "loss": 0.405, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.865234375, + "rewards/margins": 7.0, + "rewards/rejected": -4.1484375, + "step": 4172 + }, + { + "epoch": 0.8280172627610497, + "grad_norm": 36.25257895347228, + "learning_rate": 1.7889624815090195e-07, + "logits/chosen": 3.796875, + "logits/rejected": 3.74609375, + "logps/chosen": -939.0, + "logps/rejected": -731.0, + "loss": 0.4929, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.93994140625, + "rewards/margins": 6.52734375, + "rewards/rejected": -4.578125, + "step": 4173 + }, + { + "epoch": 0.8282156853018503, + "grad_norm": 33.23315399350958, + "learning_rate": 1.7872005677362756e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 3.90234375, + "logps/chosen": -793.0, + "logps/rejected": -566.0, + "loss": 0.5001, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.1865234375, + "rewards/margins": 6.2890625, + "rewards/rejected": -4.1015625, + "step": 4174 + }, + { + "epoch": 0.8284141078426509, + 
"grad_norm": 25.51923371704487, + "learning_rate": 1.7854404349274676e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.57421875, + "logps/chosen": -1276.0, + "logps/rejected": -709.5, + "loss": 0.2604, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.146484375, + "rewards/margins": 8.3671875, + "rewards/rejected": -5.21875, + "step": 4175 + }, + { + "epoch": 0.8286125303834515, + "grad_norm": 20.284098253644192, + "learning_rate": 1.7836820839269007e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.140625, + "logps/chosen": -1122.0, + "logps/rejected": -601.5, + "loss": 0.2805, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2890625, + "rewards/margins": 9.7734375, + "rewards/rejected": -6.484375, + "step": 4176 + }, + { + "epoch": 0.8288109529242522, + "grad_norm": 36.920999471738384, + "learning_rate": 1.7819255155780238e-07, + "logits/chosen": 4.5625, + "logits/rejected": 4.5234375, + "logps/chosen": -863.0, + "logps/rejected": -665.0, + "loss": 0.359, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.75390625, + "rewards/margins": 8.359375, + "rewards/rejected": -5.59375, + "step": 4177 + }, + { + "epoch": 0.8290093754650528, + "grad_norm": 29.983981835177968, + "learning_rate": 1.780170730723432e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.1484375, + "logps/chosen": -1247.0, + "logps/rejected": -865.0, + "loss": 0.4005, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.21484375, + "rewards/margins": 8.21875, + "rewards/rejected": -5.00390625, + "step": 4178 + }, + { + "epoch": 0.8292077980058534, + "grad_norm": 26.079146239763872, + "learning_rate": 1.7784177302048627e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.05078125, + "logps/chosen": -999.0, + "logps/rejected": -720.0, + "loss": 0.3739, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4443359375, + "rewards/margins": 7.66015625, + "rewards/rejected": -5.21875, + "step": 4179 + }, + { + "epoch": 0.8294062205466541, + "grad_norm": 
36.436409038442065, + "learning_rate": 1.7766665148632026e-07, + "logits/chosen": 3.44140625, + "logits/rejected": 3.3125, + "logps/chosen": -905.0, + "logps/rejected": -528.5, + "loss": 0.5037, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.686279296875, + "rewards/margins": 5.60546875, + "rewards/rejected": -3.9375, + "step": 4180 + }, + { + "epoch": 0.8296046430874547, + "grad_norm": 36.70816657084724, + "learning_rate": 1.7749170855384764e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.28125, + "logps/chosen": -1429.0, + "logps/rejected": -1095.0, + "loss": 0.2739, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.814453125, + "rewards/margins": 12.03125, + "rewards/rejected": -8.2421875, + "step": 4181 + }, + { + "epoch": 0.8298030656282553, + "grad_norm": 31.467179240057863, + "learning_rate": 1.7731694430698547e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.140625, + "logps/chosen": -1096.0, + "logps/rejected": -1226.5, + "loss": 0.2113, + "rewards/accuracies": 0.96875, + "rewards/chosen": 4.25, + "rewards/margins": 11.8125, + "rewards/rejected": -7.578125, + "step": 4182 + }, + { + "epoch": 0.8300014881690561, + "grad_norm": 25.742540934914494, + "learning_rate": 1.7714235882956502e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 3.93359375, + "logps/chosen": -932.0, + "logps/rejected": -760.0, + "loss": 0.55, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.89453125, + "rewards/margins": 7.3203125, + "rewards/rejected": -5.421875, + "step": 4183 + }, + { + "epoch": 0.8301999107098567, + "grad_norm": 22.09890822934555, + "learning_rate": 1.769679522053318e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 4.203125, + "logps/chosen": -982.0, + "logps/rejected": -1485.0, + "loss": 0.3545, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.767578125, + "rewards/margins": 11.390625, + "rewards/rejected": -8.62890625, + "step": 4184 + }, + { + "epoch": 0.8303983332506573, + "grad_norm": 31.692085482613585, + 
"learning_rate": 1.7679372451794584e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 3.5546875, + "logps/chosen": -1035.0, + "logps/rejected": -762.5, + "loss": 0.2964, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.392578125, + "rewards/margins": 8.7578125, + "rewards/rejected": -6.359375, + "step": 4185 + }, + { + "epoch": 0.8305967557914579, + "grad_norm": 36.689519602652744, + "learning_rate": 1.7661967585098064e-07, + "logits/chosen": 3.9375, + "logits/rejected": 4.25390625, + "logps/chosen": -1213.0, + "logps/rejected": -948.0, + "loss": 0.3386, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6005859375, + "rewards/margins": 8.5078125, + "rewards/rejected": -5.9296875, + "step": 4186 + }, + { + "epoch": 0.8307951783322586, + "grad_norm": 31.759271897927356, + "learning_rate": 1.7644580628792455e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 3.99609375, + "logps/chosen": -837.0, + "logps/rejected": -804.0, + "loss": 0.5275, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.73046875, + "rewards/margins": 12.828125, + "rewards/rejected": -11.05859375, + "step": 4187 + }, + { + "epoch": 0.8309936008730592, + "grad_norm": 34.83363961253479, + "learning_rate": 1.7627211591217965e-07, + "logits/chosen": 3.5, + "logits/rejected": 3.62109375, + "logps/chosen": -1087.0, + "logps/rejected": -810.5, + "loss": 0.4676, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.75390625, + "rewards/margins": 7.8359375, + "rewards/rejected": -5.0859375, + "step": 4188 + }, + { + "epoch": 0.8311920234138598, + "grad_norm": 27.876874461207542, + "learning_rate": 1.7609860480706206e-07, + "logits/chosen": 3.8359375, + "logits/rejected": 3.65625, + "logps/chosen": -995.0, + "logps/rejected": -778.0, + "loss": 0.3525, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.75390625, + "rewards/margins": 9.6171875, + "rewards/rejected": -6.85546875, + "step": 4189 + }, + { + "epoch": 0.8313904459546605, + "grad_norm": 33.35489276973032, + 
"learning_rate": 1.7592527305580223e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.109375, + "logps/chosen": -1130.0, + "logps/rejected": -759.0, + "loss": 0.438, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.837890625, + "rewards/margins": 12.5390625, + "rewards/rejected": -9.69140625, + "step": 4190 + }, + { + "epoch": 0.8315888684954611, + "grad_norm": 20.31066911288958, + "learning_rate": 1.757521207415439e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 4.0234375, + "logps/chosen": -1264.0, + "logps/rejected": -747.0, + "loss": 0.434, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8125, + "rewards/margins": 8.2265625, + "rewards/rejected": -5.3984375, + "step": 4191 + }, + { + "epoch": 0.8317872910362617, + "grad_norm": 26.623993572493227, + "learning_rate": 1.7557914794734557e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 3.6953125, + "logps/chosen": -1262.0, + "logps/rejected": -641.0, + "loss": 0.3227, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.2578125, + "rewards/margins": 9.578125, + "rewards/rejected": -6.3046875, + "step": 4192 + }, + { + "epoch": 0.8319857135770623, + "grad_norm": 25.232567024101932, + "learning_rate": 1.7540635475617883e-07, + "logits/chosen": 3.48046875, + "logits/rejected": 3.6171875, + "logps/chosen": -793.0, + "logps/rejected": -589.0, + "loss": 0.4688, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.4453125, + "rewards/margins": 8.21484375, + "rewards/rejected": -5.78515625, + "step": 4193 + }, + { + "epoch": 0.832184136117863, + "grad_norm": 27.591204602960985, + "learning_rate": 1.7523374125092978e-07, + "logits/chosen": 3.859375, + "logits/rejected": 4.046875, + "logps/chosen": -713.0, + "logps/rejected": -631.0, + "loss": 0.4422, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.28125, + "rewards/margins": 8.0625, + "rewards/rejected": -6.7890625, + "step": 4194 + }, + { + "epoch": 0.8323825586586636, + "grad_norm": 28.05257734400123, + "learning_rate": 
1.75061307514398e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 4.0078125, + "logps/chosen": -986.0, + "logps/rejected": -864.0, + "loss": 0.3944, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.697265625, + "rewards/margins": 9.0390625, + "rewards/rejected": -6.328125, + "step": 4195 + }, + { + "epoch": 0.8325809811994642, + "grad_norm": 29.091333919429573, + "learning_rate": 1.7488905362929678e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.015625, + "logps/chosen": -785.0, + "logps/rejected": -978.5, + "loss": 0.3866, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.244140625, + "rewards/margins": 9.2734375, + "rewards/rejected": -7.0390625, + "step": 4196 + }, + { + "epoch": 0.8327794037402649, + "grad_norm": 38.90302993238457, + "learning_rate": 1.7471697967825355e-07, + "logits/chosen": 3.6328125, + "logits/rejected": 3.6796875, + "logps/chosen": -939.0, + "logps/rejected": -683.5, + "loss": 0.5846, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.135009765625, + "rewards/margins": 6.546875, + "rewards/rejected": -4.40625, + "step": 4197 + }, + { + "epoch": 0.8329778262810655, + "grad_norm": 30.250959660052125, + "learning_rate": 1.7454508574380872e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 3.99609375, + "logps/chosen": -805.5, + "logps/rejected": -678.0, + "loss": 0.4017, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.42578125, + "rewards/margins": 8.3359375, + "rewards/rejected": -5.8984375, + "step": 4198 + }, + { + "epoch": 0.8331762488218661, + "grad_norm": 33.222598078696535, + "learning_rate": 1.74373371908417e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 3.90234375, + "logps/chosen": -649.5, + "logps/rejected": -714.0, + "loss": 0.4757, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.7421875, + "rewards/margins": 7.578125, + "rewards/rejected": -5.83203125, + "step": 4199 + }, + { + "epoch": 0.8333746713626669, + "grad_norm": 26.002196231297738, + "learning_rate": 
1.7420183825444646e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.01171875, + "logps/chosen": -1266.0, + "logps/rejected": -1027.0, + "loss": 0.3894, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.379150390625, + "rewards/margins": 10.1796875, + "rewards/rejected": -7.8125, + "step": 4200 + }, + { + "epoch": 0.8335730939034675, + "grad_norm": 41.00645634356432, + "learning_rate": 1.7403048486417868e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 3.984375, + "logps/chosen": -981.0, + "logps/rejected": -825.0, + "loss": 0.3857, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.8818359375, + "rewards/margins": 8.53125, + "rewards/rejected": -6.6328125, + "step": 4201 + }, + { + "epoch": 0.8337715164442681, + "grad_norm": 33.54485874865107, + "learning_rate": 1.7385931181980884e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.21484375, + "logps/chosen": -1002.0, + "logps/rejected": -807.0, + "loss": 0.3962, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.89453125, + "rewards/margins": 7.765625, + "rewards/rejected": -4.875, + "step": 4202 + }, + { + "epoch": 0.8339699389850687, + "grad_norm": 42.74678381723157, + "learning_rate": 1.7368831920344557e-07, + "logits/chosen": 3.859375, + "logits/rejected": 3.67578125, + "logps/chosen": -957.0, + "logps/rejected": -766.0, + "loss": 0.351, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4296875, + "rewards/margins": 8.1015625, + "rewards/rejected": -5.658203125, + "step": 4203 + }, + { + "epoch": 0.8341683615258694, + "grad_norm": 26.90661817701463, + "learning_rate": 1.735175070971111e-07, + "logits/chosen": 4.31640625, + "logits/rejected": 4.39453125, + "logps/chosen": -977.0, + "logps/rejected": -1217.0, + "loss": 0.4579, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.478515625, + "rewards/margins": 9.609375, + "rewards/rejected": -7.15625, + "step": 4204 + }, + { + "epoch": 0.83436678406667, + "grad_norm": 26.662730356092933, + "learning_rate": 
1.7334687558274098e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 3.984375, + "logps/chosen": -850.5, + "logps/rejected": -688.5, + "loss": 0.353, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.328125, + "rewards/margins": 7.03125, + "rewards/rejected": -4.70703125, + "step": 4205 + }, + { + "epoch": 0.8345652066074706, + "grad_norm": 31.381473426563055, + "learning_rate": 1.73176424742184e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.2890625, + "logps/chosen": -992.0, + "logps/rejected": -920.5, + "loss": 0.4635, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5224609375, + "rewards/margins": 8.453125, + "rewards/rejected": -5.9609375, + "step": 4206 + }, + { + "epoch": 0.8347636291482713, + "grad_norm": 31.393255775863107, + "learning_rate": 1.7300615465720247e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 4.1015625, + "logps/chosen": -1104.0, + "logps/rejected": -727.0, + "loss": 0.4296, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.88671875, + "rewards/margins": 7.2890625, + "rewards/rejected": -4.40234375, + "step": 4207 + }, + { + "epoch": 0.8349620516890719, + "grad_norm": 26.477511714253577, + "learning_rate": 1.7283606540947187e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.01953125, + "logps/chosen": -1231.0, + "logps/rejected": -834.5, + "loss": 0.259, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.900390625, + "rewards/margins": 10.34375, + "rewards/rejected": -7.4453125, + "step": 4208 + }, + { + "epoch": 0.8351604742298725, + "grad_norm": 28.001041857933952, + "learning_rate": 1.7266615708058098e-07, + "logits/chosen": 3.671875, + "logits/rejected": 3.7734375, + "logps/chosen": -1136.0, + "logps/rejected": -881.0, + "loss": 0.3078, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.16796875, + "rewards/margins": 11.296875, + "rewards/rejected": -8.1171875, + "step": 4209 + }, + { + "epoch": 0.8353588967706731, + "grad_norm": 33.302560086693234, + "learning_rate": 
1.7249642975203204e-07, + "logits/chosen": 4.36328125, + "logits/rejected": 4.01953125, + "logps/chosen": -997.0, + "logps/rejected": -700.0, + "loss": 0.359, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.16796875, + "rewards/margins": 7.9453125, + "rewards/rejected": -5.7890625, + "step": 4210 + }, + { + "epoch": 0.8355573193114738, + "grad_norm": 29.24184144775441, + "learning_rate": 1.7232688350523983e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.01953125, + "logps/chosen": -859.5, + "logps/rejected": -768.5, + "loss": 0.2961, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.568359375, + "rewards/margins": 10.03125, + "rewards/rejected": -7.46875, + "step": 4211 + }, + { + "epoch": 0.8357557418522744, + "grad_norm": 32.87526644139591, + "learning_rate": 1.7215751842153298e-07, + "logits/chosen": 3.66015625, + "logits/rejected": 3.5859375, + "logps/chosen": -630.0, + "logps/rejected": -447.0, + "loss": 0.5374, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.912109375, + "rewards/margins": 4.79296875, + "rewards/rejected": -2.873046875, + "step": 4212 + }, + { + "epoch": 0.835954164393075, + "grad_norm": 27.052272555504302, + "learning_rate": 1.7198833458215287e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 4.17578125, + "logps/chosen": -876.0, + "logps/rejected": -724.0, + "loss": 0.4426, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.529296875, + "rewards/margins": 7.3046875, + "rewards/rejected": -5.78125, + "step": 4213 + }, + { + "epoch": 0.8361525869338757, + "grad_norm": 27.911691144190517, + "learning_rate": 1.7181933206825394e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.15625, + "logps/chosen": -869.0, + "logps/rejected": -595.0, + "loss": 0.3985, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.640625, + "rewards/margins": 6.83984375, + "rewards/rejected": -4.2099609375, + "step": 4214 + }, + { + "epoch": 0.8363510094746763, + "grad_norm": 35.03031630184719, + "learning_rate": 
1.7165051096090366e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.91796875, + "logps/chosen": -922.5, + "logps/rejected": -1231.0, + "loss": 0.3257, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.251220703125, + "rewards/margins": 9.8203125, + "rewards/rejected": -7.5625, + "step": 4215 + }, + { + "epoch": 0.8365494320154769, + "grad_norm": 30.755129245457702, + "learning_rate": 1.7148187134108244e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.9453125, + "logps/chosen": -722.5, + "logps/rejected": -560.0, + "loss": 0.326, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5, + "rewards/margins": 6.9453125, + "rewards/rejected": -4.43359375, + "step": 4216 + }, + { + "epoch": 0.8367478545562776, + "grad_norm": 48.073426943478395, + "learning_rate": 1.71313413289684e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 3.95703125, + "logps/chosen": -1180.0, + "logps/rejected": -815.0, + "loss": 0.3788, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.96875, + "rewards/margins": 8.140625, + "rewards/rejected": -6.171875, + "step": 4217 + }, + { + "epoch": 0.8369462770970783, + "grad_norm": 40.421732703519226, + "learning_rate": 1.7114513688751432e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 4.0, + "logps/chosen": -637.5, + "logps/rejected": -563.5, + "loss": 0.5214, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.724609375, + "rewards/margins": 4.875, + "rewards/rejected": -3.150390625, + "step": 4218 + }, + { + "epoch": 0.8371446996378789, + "grad_norm": 31.18565825161363, + "learning_rate": 1.7097704221529286e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 4.1484375, + "logps/chosen": -1044.0, + "logps/rejected": -1107.0, + "loss": 0.402, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.615234375, + "rewards/margins": 9.9765625, + "rewards/rejected": -7.3515625, + "step": 4219 + }, + { + "epoch": 0.8373431221786795, + "grad_norm": 28.2989785888987, + "learning_rate": 1.708091293536516e-07, + 
"logits/chosen": 4.02734375, + "logits/rejected": 4.12109375, + "logps/chosen": -1004.0, + "logps/rejected": -668.0, + "loss": 0.3075, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.03125, + "rewards/margins": 8.34375, + "rewards/rejected": -5.3203125, + "step": 4220 + }, + { + "epoch": 0.8375415447194802, + "grad_norm": 27.80966415240147, + "learning_rate": 1.7064139838313517e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.78125, + "logps/chosen": -1064.0, + "logps/rejected": -796.5, + "loss": 0.3068, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.455078125, + "rewards/margins": 9.1328125, + "rewards/rejected": -6.6875, + "step": 4221 + }, + { + "epoch": 0.8377399672602808, + "grad_norm": 35.15272228346324, + "learning_rate": 1.704738493842015e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 3.9609375, + "logps/chosen": -978.0, + "logps/rejected": -572.0, + "loss": 0.3337, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8359375, + "rewards/margins": 8.484375, + "rewards/rejected": -5.640625, + "step": 4222 + }, + { + "epoch": 0.8379383898010814, + "grad_norm": 29.21957350501143, + "learning_rate": 1.7030648243722048e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 3.99609375, + "logps/chosen": -1102.0, + "logps/rejected": -674.5, + "loss": 0.3166, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2421875, + "rewards/margins": 9.0703125, + "rewards/rejected": -5.8203125, + "step": 4223 + }, + { + "epoch": 0.8381368123418821, + "grad_norm": 31.895610346818007, + "learning_rate": 1.7013929762247528e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.77734375, + "logps/chosen": -1450.0, + "logps/rejected": -1425.0, + "loss": 0.2978, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.84765625, + "rewards/margins": 10.9296875, + "rewards/rejected": -8.078125, + "step": 4224 + }, + { + "epoch": 0.8383352348826827, + "grad_norm": 44.38493952810495, + "learning_rate": 1.6997229502016147e-07, + "logits/chosen": 
4.140625, + "logits/rejected": 4.203125, + "logps/chosen": -1149.0, + "logps/rejected": -960.0, + "loss": 0.4057, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.98046875, + "rewards/margins": 8.1796875, + "rewards/rejected": -5.185546875, + "step": 4225 + }, + { + "epoch": 0.8385336574234833, + "grad_norm": 24.703587636531637, + "learning_rate": 1.698054747103872e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 4.0546875, + "logps/chosen": -784.0, + "logps/rejected": -587.5, + "loss": 0.4975, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.232421875, + "rewards/margins": 7.48828125, + "rewards/rejected": -5.251953125, + "step": 4226 + }, + { + "epoch": 0.8387320799642839, + "grad_norm": 31.520622823311214, + "learning_rate": 1.6963883677317322e-07, + "logits/chosen": 4.38671875, + "logits/rejected": 4.15625, + "logps/chosen": -901.0, + "logps/rejected": -1453.0, + "loss": 0.4829, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.8505859375, + "rewards/margins": 8.08203125, + "rewards/rejected": -6.23046875, + "step": 4227 + }, + { + "epoch": 0.8389305025050846, + "grad_norm": 31.254362914180405, + "learning_rate": 1.694723812884527e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.08203125, + "logps/chosen": -885.0, + "logps/rejected": -780.0, + "loss": 0.4296, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.87109375, + "rewards/margins": 7.140625, + "rewards/rejected": -5.275390625, + "step": 4228 + }, + { + "epoch": 0.8391289250458852, + "grad_norm": 41.68182439747439, + "learning_rate": 1.693061083360715e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 3.84765625, + "logps/chosen": -1071.0, + "logps/rejected": -731.0, + "loss": 0.3594, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.876953125, + "rewards/margins": 8.5390625, + "rewards/rejected": -5.6640625, + "step": 4229 + }, + { + "epoch": 0.8393273475866858, + "grad_norm": 21.025257962838786, + "learning_rate": 1.6914001799578781e-07, + "logits/chosen": 
3.71484375, + "logits/rejected": 3.66796875, + "logps/chosen": -858.0, + "logps/rejected": -614.0, + "loss": 0.417, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.66015625, + "rewards/margins": 7.79296875, + "rewards/rejected": -5.138671875, + "step": 4230 + }, + { + "epoch": 0.8395257701274865, + "grad_norm": 35.93736211795713, + "learning_rate": 1.6897411034727217e-07, + "logits/chosen": 3.984375, + "logits/rejected": 3.79296875, + "logps/chosen": -993.5, + "logps/rejected": -608.0, + "loss": 0.4191, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3671875, + "rewards/margins": 6.96875, + "rewards/rejected": -4.58203125, + "step": 4231 + }, + { + "epoch": 0.8397241926682871, + "grad_norm": 33.56518104357434, + "learning_rate": 1.6880838547010745e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 4.20703125, + "logps/chosen": -687.0, + "logps/rejected": -548.5, + "loss": 0.4911, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.338043212890625, + "rewards/margins": 5.484375, + "rewards/rejected": -4.14453125, + "step": 4232 + }, + { + "epoch": 0.8399226152090877, + "grad_norm": 26.375770358796142, + "learning_rate": 1.686428434437891e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.81640625, + "logps/chosen": -863.0, + "logps/rejected": -829.0, + "loss": 0.47, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.916015625, + "rewards/margins": 6.96875, + "rewards/rejected": -5.04296875, + "step": 4233 + }, + { + "epoch": 0.8401210377498883, + "grad_norm": 30.175978577753273, + "learning_rate": 1.6847748434772448e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.25, + "logps/chosen": -628.25, + "logps/rejected": -1262.0, + "loss": 0.4943, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.296875, + "rewards/margins": 7.8984375, + "rewards/rejected": -6.59375, + "step": 4234 + }, + { + "epoch": 0.840319460290689, + "grad_norm": 25.183883884533376, + "learning_rate": 1.683123082612336e-07, + "logits/chosen": 3.97265625, 
+ "logits/rejected": 3.921875, + "logps/chosen": -907.0, + "logps/rejected": -639.0, + "loss": 0.4059, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.9736328125, + "rewards/margins": 7.640625, + "rewards/rejected": -5.671875, + "step": 4235 + }, + { + "epoch": 0.8405178828314896, + "grad_norm": 30.14942184939539, + "learning_rate": 1.6814731526354846e-07, + "logits/chosen": 3.8046875, + "logits/rejected": 3.71484375, + "logps/chosen": -1225.0, + "logps/rejected": -739.0, + "loss": 0.3664, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4921875, + "rewards/margins": 7.4140625, + "rewards/rejected": -4.93359375, + "step": 4236 + }, + { + "epoch": 0.8407163053722903, + "grad_norm": 30.37629510536669, + "learning_rate": 1.679825054338132e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.27734375, + "logps/chosen": -843.0, + "logps/rejected": -698.0, + "loss": 0.2578, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.78125, + "rewards/margins": 9.09375, + "rewards/rejected": -6.3203125, + "step": 4237 + }, + { + "epoch": 0.840914727913091, + "grad_norm": 28.550547660308165, + "learning_rate": 1.678178788510842e-07, + "logits/chosen": 4.140625, + "logits/rejected": 3.98046875, + "logps/chosen": -1044.5, + "logps/rejected": -673.5, + "loss": 0.2703, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.09375, + "rewards/margins": 9.0234375, + "rewards/rejected": -5.9296875, + "step": 4238 + }, + { + "epoch": 0.8411131504538916, + "grad_norm": 34.2316915675065, + "learning_rate": 1.6765343559432992e-07, + "logits/chosen": 3.828125, + "logits/rejected": 3.90234375, + "logps/chosen": -678.0, + "logps/rejected": -631.5, + "loss": 0.4631, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.26953125, + "rewards/margins": 5.72265625, + "rewards/rejected": -3.455078125, + "step": 4239 + }, + { + "epoch": 0.8413115729946922, + "grad_norm": 28.215439097805625, + "learning_rate": 1.674891757424309e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 
3.890625, + "logps/chosen": -1232.0, + "logps/rejected": -979.5, + "loss": 0.3646, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.3240966796875, + "rewards/margins": 10.046875, + "rewards/rejected": -6.7109375, + "step": 4240 + }, + { + "epoch": 0.8415099955354929, + "grad_norm": 37.399360361722195, + "learning_rate": 1.6732509937417944e-07, + "logits/chosen": 3.7734375, + "logits/rejected": 3.65625, + "logps/chosen": -764.5, + "logps/rejected": -875.0, + "loss": 0.4802, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.451171875, + "rewards/margins": 6.8984375, + "rewards/rejected": -4.4609375, + "step": 4241 + }, + { + "epoch": 0.8417084180762935, + "grad_norm": 35.4532894911628, + "learning_rate": 1.6716120656828052e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.22265625, + "logps/chosen": -1029.0, + "logps/rejected": -833.0, + "loss": 0.4513, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.541015625, + "rewards/margins": 7.4609375, + "rewards/rejected": -4.92578125, + "step": 4242 + }, + { + "epoch": 0.8419068406170941, + "grad_norm": 26.85398695787803, + "learning_rate": 1.6699749740335013e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.2734375, + "logps/chosen": -1189.0, + "logps/rejected": -836.0, + "loss": 0.3644, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9609375, + "rewards/margins": 8.0859375, + "rewards/rejected": -5.1171875, + "step": 4243 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 36.56930849303885, + "learning_rate": 1.6683397195791694e-07, + "logits/chosen": 3.625, + "logits/rejected": 3.64453125, + "logps/chosen": -945.0, + "logps/rejected": -724.0, + "loss": 0.512, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.970703125, + "rewards/margins": 7.1640625, + "rewards/rejected": -4.19140625, + "step": 4244 + }, + { + "epoch": 0.8423036856986954, + "grad_norm": 29.265699796287237, + "learning_rate": 1.6667063031042117e-07, + "logits/chosen": 3.70703125, + "logits/rejected": 3.8359375, + 
"logps/chosen": -739.0, + "logps/rejected": -629.5, + "loss": 0.4548, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.22607421875, + "rewards/margins": 5.9990234375, + "rewards/rejected": -3.767578125, + "step": 4245 + }, + { + "epoch": 0.842502108239496, + "grad_norm": 27.516736469640616, + "learning_rate": 1.665074725392147e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.93359375, + "logps/chosen": -720.0, + "logps/rejected": -543.5, + "loss": 0.4682, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.720703125, + "rewards/margins": 6.2890625, + "rewards/rejected": -4.5703125, + "step": 4246 + }, + { + "epoch": 0.8427005307802966, + "grad_norm": 28.875527646238513, + "learning_rate": 1.6634449872256178e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 4.0703125, + "logps/chosen": -913.0, + "logps/rejected": -699.0, + "loss": 0.2572, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.72265625, + "rewards/margins": 9.265625, + "rewards/rejected": -6.5390625, + "step": 4247 + }, + { + "epoch": 0.8428989533210973, + "grad_norm": 138.66983916641067, + "learning_rate": 1.6618170893863765e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 4.203125, + "logps/chosen": -1139.0, + "logps/rejected": -1090.0, + "loss": 0.3556, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.548828125, + "rewards/margins": 8.65625, + "rewards/rejected": -6.109375, + "step": 4248 + }, + { + "epoch": 0.8430973758618979, + "grad_norm": 30.158677701856476, + "learning_rate": 1.6601910326552998e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.08984375, + "logps/chosen": -943.0, + "logps/rejected": -869.0, + "loss": 0.4486, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.169921875, + "rewards/margins": 6.6875, + "rewards/rejected": -4.515625, + "step": 4249 + }, + { + "epoch": 0.8432957984026985, + "grad_norm": 32.13487790035363, + "learning_rate": 1.6585668178123769e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.99609375, + 
"logps/chosen": -1071.0, + "logps/rejected": -1386.0, + "loss": 0.4097, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.05078125, + "rewards/margins": 12.875, + "rewards/rejected": -10.828125, + "step": 4250 + }, + { + "epoch": 0.8434942209434991, + "grad_norm": 33.64848920167769, + "learning_rate": 1.6569444456367144e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 3.53515625, + "logps/chosen": -1096.0, + "logps/rejected": -663.5, + "loss": 0.3706, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.0791015625, + "rewards/margins": 8.32421875, + "rewards/rejected": -6.2421875, + "step": 4251 + }, + { + "epoch": 0.8436926434842998, + "grad_norm": 29.941840467182413, + "learning_rate": 1.6553239169065374e-07, + "logits/chosen": 3.7265625, + "logits/rejected": 3.515625, + "logps/chosen": -1246.0, + "logps/rejected": -1498.0, + "loss": 0.417, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.46484375, + "rewards/margins": 11.1640625, + "rewards/rejected": -8.7109375, + "step": 4252 + }, + { + "epoch": 0.8438910660251004, + "grad_norm": 24.147899207272726, + "learning_rate": 1.6537052323991824e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 3.96875, + "logps/chosen": -1034.0, + "logps/rejected": -709.0, + "loss": 0.3535, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.880859375, + "rewards/margins": 8.1171875, + "rewards/rejected": -5.234375, + "step": 4253 + }, + { + "epoch": 0.844089488565901, + "grad_norm": 35.50938039663847, + "learning_rate": 1.652088392891105e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.12109375, + "logps/chosen": -1046.0, + "logps/rejected": -720.0, + "loss": 0.3105, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.775390625, + "rewards/margins": 8.15625, + "rewards/rejected": -5.390625, + "step": 4254 + }, + { + "epoch": 0.8442879111067018, + "grad_norm": 21.99831532559212, + "learning_rate": 1.650473399157874e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 4.01953125, + 
"logps/chosen": -1233.0, + "logps/rejected": -689.0, + "loss": 0.399, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.400390625, + "rewards/margins": 7.2890625, + "rewards/rejected": -4.90625, + "step": 4255 + }, + { + "epoch": 0.8444863336475024, + "grad_norm": 29.200842651713117, + "learning_rate": 1.6488602519741738e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 4.1328125, + "logps/chosen": -1053.0, + "logps/rejected": -787.0, + "loss": 0.3582, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1171875, + "rewards/margins": 8.78125, + "rewards/rejected": -5.671875, + "step": 4256 + }, + { + "epoch": 0.844684756188303, + "grad_norm": 25.971930425975803, + "learning_rate": 1.6472489521138015e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.09375, + "logps/chosen": -792.5, + "logps/rejected": -1222.0, + "loss": 0.4125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.2783203125, + "rewards/margins": 9.78515625, + "rewards/rejected": -7.4921875, + "step": 4257 + }, + { + "epoch": 0.8448831787291037, + "grad_norm": 37.48433404924111, + "learning_rate": 1.645639500349669e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.12109375, + "logps/chosen": -903.0, + "logps/rejected": -634.0, + "loss": 0.4395, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6796875, + "rewards/margins": 7.4375, + "rewards/rejected": -4.7421875, + "step": 4258 + }, + { + "epoch": 0.8450816012699043, + "grad_norm": 24.801263900129413, + "learning_rate": 1.6440318974538025e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 3.83203125, + "logps/chosen": -1101.0, + "logps/rejected": -697.0, + "loss": 0.4113, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9765625, + "rewards/margins": 7.703125, + "rewards/rejected": -4.734375, + "step": 4259 + }, + { + "epoch": 0.8452800238107049, + "grad_norm": 35.43421119650443, + "learning_rate": 1.6424261441973394e-07, + "logits/chosen": 3.87109375, + "logits/rejected": 3.73828125, + "logps/chosen": 
-993.0, + "logps/rejected": -632.0, + "loss": 0.4958, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.95947265625, + "rewards/margins": 5.91015625, + "rewards/rejected": -3.9453125, + "step": 4260 + }, + { + "epoch": 0.8454784463515055, + "grad_norm": 27.160492603119202, + "learning_rate": 1.6408222413505323e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.75, + "logps/chosen": -1061.0, + "logps/rejected": -1098.0, + "loss": 0.3634, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.96875, + "rewards/margins": 8.640625, + "rewards/rejected": -5.703125, + "step": 4261 + }, + { + "epoch": 0.8456768688923062, + "grad_norm": 27.191610979708503, + "learning_rate": 1.6392201896827445e-07, + "logits/chosen": 3.65625, + "logits/rejected": 3.59375, + "logps/chosen": -775.0, + "logps/rejected": -1397.0, + "loss": 0.275, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.392578125, + "rewards/margins": 11.984375, + "rewards/rejected": -9.5859375, + "step": 4262 + }, + { + "epoch": 0.8458752914331068, + "grad_norm": 29.089228550736202, + "learning_rate": 1.637619989962452e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 3.84765625, + "logps/chosen": -701.5, + "logps/rejected": -666.5, + "loss": 0.3782, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5078125, + "rewards/margins": 8.0859375, + "rewards/rejected": -5.578125, + "step": 4263 + }, + { + "epoch": 0.8460737139739074, + "grad_norm": 36.58778461493171, + "learning_rate": 1.636021642957242e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.06640625, + "logps/chosen": -853.0, + "logps/rejected": -681.5, + "loss": 0.4299, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.43359375, + "rewards/margins": 7.59375, + "rewards/rejected": -5.14453125, + "step": 4264 + }, + { + "epoch": 0.8462721365147081, + "grad_norm": 31.982855861419267, + "learning_rate": 1.6344251494338117e-07, + "logits/chosen": 3.8046875, + "logits/rejected": 4.0625, + "logps/chosen": -1052.0, + "logps/rejected": 
-843.0, + "loss": 0.2945, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.94140625, + "rewards/margins": 8.71875, + "rewards/rejected": -5.78125, + "step": 4265 + }, + { + "epoch": 0.8464705590555087, + "grad_norm": 35.08896343360817, + "learning_rate": 1.632830510157972e-07, + "logits/chosen": 3.84375, + "logits/rejected": 4.1796875, + "logps/chosen": -986.0, + "logps/rejected": -1288.0, + "loss": 0.4046, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.40234375, + "rewards/margins": 9.4609375, + "rewards/rejected": -6.0546875, + "step": 4266 + }, + { + "epoch": 0.8466689815963093, + "grad_norm": 28.02234808649541, + "learning_rate": 1.6312377258946436e-07, + "logits/chosen": 4.078125, + "logits/rejected": 3.94921875, + "logps/chosen": -713.0, + "logps/rejected": -617.0, + "loss": 0.3858, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.658203125, + "rewards/margins": 8.6484375, + "rewards/rejected": -5.9921875, + "step": 4267 + }, + { + "epoch": 0.8468674041371099, + "grad_norm": 34.6440433499149, + "learning_rate": 1.6296467974078566e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 3.890625, + "logps/chosen": -808.0, + "logps/rejected": -837.5, + "loss": 0.4358, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.76171875, + "rewards/margins": 7.8359375, + "rewards/rejected": -6.0546875, + "step": 4268 + }, + { + "epoch": 0.8470658266779106, + "grad_norm": 31.294577017618224, + "learning_rate": 1.6280577254607502e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.14453125, + "logps/chosen": -964.0, + "logps/rejected": -743.0, + "loss": 0.4131, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.32421875, + "rewards/margins": 7.5390625, + "rewards/rejected": -5.22265625, + "step": 4269 + }, + { + "epoch": 0.8472642492187112, + "grad_norm": 25.434904837828757, + "learning_rate": 1.626470510815575e-07, + "logits/chosen": 3.58203125, + "logits/rejected": 3.61328125, + "logps/chosen": -1007.0, + "logps/rejected": -685.0, + "loss": 
0.2928, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.953125, + "rewards/margins": 9.3828125, + "rewards/rejected": -6.4375, + "step": 4270 + }, + { + "epoch": 0.8474626717595118, + "grad_norm": 33.46894938907433, + "learning_rate": 1.6248851542336878e-07, + "logits/chosen": 3.77734375, + "logits/rejected": 4.1796875, + "logps/chosen": -1092.0, + "logps/rejected": -1263.0, + "loss": 0.3457, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.541015625, + "rewards/margins": 9.8203125, + "rewards/rejected": -7.2734375, + "step": 4271 + }, + { + "epoch": 0.8476610943003126, + "grad_norm": 31.18888921654607, + "learning_rate": 1.6233016564755596e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.89453125, + "logps/chosen": -972.0, + "logps/rejected": -614.5, + "loss": 0.4069, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.318359375, + "rewards/margins": 7.6015625, + "rewards/rejected": -5.265625, + "step": 4272 + }, + { + "epoch": 0.8478595168411132, + "grad_norm": 32.472952241129235, + "learning_rate": 1.6217200183007623e-07, + "logits/chosen": 3.6328125, + "logits/rejected": 3.6171875, + "logps/chosen": -1476.0, + "logps/rejected": -942.0, + "loss": 0.4102, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.03125, + "rewards/margins": 9.7890625, + "rewards/rejected": -6.75, + "step": 4273 + }, + { + "epoch": 0.8480579393819138, + "grad_norm": 34.83581475793052, + "learning_rate": 1.620140240467982e-07, + "logits/chosen": 4.015625, + "logits/rejected": 3.875, + "logps/chosen": -816.0, + "logps/rejected": -600.5, + "loss": 0.4916, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.5546875, + "rewards/margins": 5.90625, + "rewards/rejected": -4.33203125, + "step": 4274 + }, + { + "epoch": 0.8482563619227145, + "grad_norm": 25.206111846942825, + "learning_rate": 1.618562323735009e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 4.07421875, + "logps/chosen": -1058.0, + "logps/rejected": -949.0, + "loss": 0.2642, + 
"rewards/accuracies": 0.90625, + "rewards/chosen": 4.0703125, + "rewards/margins": 11.125, + "rewards/rejected": -7.0546875, + "step": 4275 + }, + { + "epoch": 0.8484547844635151, + "grad_norm": 31.94030519366565, + "learning_rate": 1.6169862688587414e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.12890625, + "logps/chosen": -1035.5, + "logps/rejected": -693.5, + "loss": 0.413, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.119140625, + "rewards/margins": 9.3828125, + "rewards/rejected": -6.29296875, + "step": 4276 + }, + { + "epoch": 0.8486532070043157, + "grad_norm": 29.596783077072242, + "learning_rate": 1.615412076595188e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 3.92578125, + "logps/chosen": -715.0, + "logps/rejected": -573.5, + "loss": 0.5847, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.046875, + "rewards/margins": 5.71875, + "rewards/rejected": -3.68359375, + "step": 4277 + }, + { + "epoch": 0.8488516295451163, + "grad_norm": 25.77615419099429, + "learning_rate": 1.6138397476994556e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 3.80078125, + "logps/chosen": -1188.0, + "logps/rejected": -692.0, + "loss": 0.2892, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.84375, + "rewards/margins": 9.5078125, + "rewards/rejected": -6.65625, + "step": 4278 + }, + { + "epoch": 0.849050052085917, + "grad_norm": 33.10117431901978, + "learning_rate": 1.6122692829257685e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.14453125, + "logps/chosen": -1013.0, + "logps/rejected": -715.0, + "loss": 0.4125, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.78125, + "rewards/margins": 7.4765625, + "rewards/rejected": -4.6953125, + "step": 4279 + }, + { + "epoch": 0.8492484746267176, + "grad_norm": 23.849234990346098, + "learning_rate": 1.6107006830274446e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 3.82421875, + "logps/chosen": -1275.0, + "logps/rejected": -904.5, + "loss": 0.4133, + "rewards/accuracies": 
0.84375, + "rewards/chosen": 2.72265625, + "rewards/margins": 8.4765625, + "rewards/rejected": -5.748046875, + "step": 4280 + }, + { + "epoch": 0.8494468971675182, + "grad_norm": 36.86295763182246, + "learning_rate": 1.6091339487569178e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.8203125, + "logps/chosen": -840.5, + "logps/rejected": -665.0, + "loss": 0.4101, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.541015625, + "rewards/margins": 7.015625, + "rewards/rejected": -4.4765625, + "step": 4281 + }, + { + "epoch": 0.8496453197083189, + "grad_norm": 30.05378933905582, + "learning_rate": 1.6075690808657209e-07, + "logits/chosen": 3.61328125, + "logits/rejected": 3.625, + "logps/chosen": -1268.0, + "logps/rejected": -805.0, + "loss": 0.3386, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8359375, + "rewards/margins": 9.0390625, + "rewards/rejected": -6.1953125, + "step": 4282 + }, + { + "epoch": 0.8498437422491195, + "grad_norm": 36.928846953396246, + "learning_rate": 1.6060060801044924e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.1640625, + "logps/chosen": -985.0, + "logps/rejected": -538.0, + "loss": 0.5268, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.3134765625, + "rewards/margins": 5.66796875, + "rewards/rejected": -3.35546875, + "step": 4283 + }, + { + "epoch": 0.8500421647899201, + "grad_norm": 35.53058604588617, + "learning_rate": 1.60444494722298e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 4.1484375, + "logps/chosen": -806.0, + "logps/rejected": -694.0, + "loss": 0.3966, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.34375, + "rewards/margins": 8.48828125, + "rewards/rejected": -6.142578125, + "step": 4284 + }, + { + "epoch": 0.8502405873307207, + "grad_norm": 32.13311152610795, + "learning_rate": 1.6028856829700258e-07, + "logits/chosen": 4.046875, + "logits/rejected": 4.0, + "logps/chosen": -828.0, + "logps/rejected": -521.5, + "loss": 0.5296, + "rewards/accuracies": 0.75, + 
"rewards/chosen": 1.408203125, + "rewards/margins": 6.16796875, + "rewards/rejected": -4.76171875, + "step": 4285 + }, + { + "epoch": 0.8504390098715214, + "grad_norm": 27.13878260544936, + "learning_rate": 1.6013282880935854e-07, + "logits/chosen": 4.3515625, + "logits/rejected": 4.37890625, + "logps/chosen": -1812.0, + "logps/rejected": -1696.0, + "loss": 0.2372, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.37109375, + "rewards/margins": 15.15625, + "rewards/rejected": -11.8046875, + "step": 4286 + }, + { + "epoch": 0.850637432412322, + "grad_norm": 30.992278117949063, + "learning_rate": 1.599772763340712e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.21484375, + "logps/chosen": -1000.0, + "logps/rejected": -879.0, + "loss": 0.4472, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.765625, + "rewards/margins": 7.8828125, + "rewards/rejected": -5.109375, + "step": 4287 + }, + { + "epoch": 0.8508358549531226, + "grad_norm": 29.665170895407883, + "learning_rate": 1.5982191094575634e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.0078125, + "logps/chosen": -675.0, + "logps/rejected": -535.0, + "loss": 0.5523, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.79833984375, + "rewards/margins": 4.671875, + "rewards/rejected": -2.865234375, + "step": 4288 + }, + { + "epoch": 0.8510342774939234, + "grad_norm": 31.41782943916311, + "learning_rate": 1.5966673271893994e-07, + "logits/chosen": 3.78515625, + "logits/rejected": 3.66015625, + "logps/chosen": -1294.0, + "logps/rejected": -858.0, + "loss": 0.2666, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.84375, + "rewards/margins": 9.8984375, + "rewards/rejected": -7.046875, + "step": 4289 + }, + { + "epoch": 0.851232700034724, + "grad_norm": 38.80960244226707, + "learning_rate": 1.5951174172805827e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.3203125, + "logps/chosen": -743.5, + "logps/rejected": -747.0, + "loss": 0.5253, + "rewards/accuracies": 0.78125, + 
"rewards/chosen": 2.57421875, + "rewards/margins": 5.171875, + "rewards/rejected": -2.6025390625, + "step": 4290 + }, + { + "epoch": 0.8514311225755246, + "grad_norm": 35.124907523871826, + "learning_rate": 1.593569380474576e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.0234375, + "logps/chosen": -854.0, + "logps/rejected": -617.5, + "loss": 0.5057, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.314453125, + "rewards/margins": 6.0625, + "rewards/rejected": -3.75390625, + "step": 4291 + }, + { + "epoch": 0.8516295451163252, + "grad_norm": 30.43346819210149, + "learning_rate": 1.5920232175139477e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.0, + "logps/chosen": -1043.0, + "logps/rejected": -1652.0, + "loss": 0.3474, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.236328125, + "rewards/margins": 10.390625, + "rewards/rejected": -8.1640625, + "step": 4292 + }, + { + "epoch": 0.8518279676571259, + "grad_norm": 21.81501125288097, + "learning_rate": 1.5904789291403636e-07, + "logits/chosen": 4.03125, + "logits/rejected": 4.203125, + "logps/chosen": -875.0, + "logps/rejected": -1342.5, + "loss": 0.3329, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.759765625, + "rewards/margins": 10.953125, + "rewards/rejected": -8.19921875, + "step": 4293 + }, + { + "epoch": 0.8520263901979265, + "grad_norm": 24.760376452136285, + "learning_rate": 1.588936516094591e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.796875, + "logps/chosen": -884.0, + "logps/rejected": -667.0, + "loss": 0.3406, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7265625, + "rewards/margins": 7.9453125, + "rewards/rejected": -5.2265625, + "step": 4294 + }, + { + "epoch": 0.8522248127387271, + "grad_norm": 26.62531591473465, + "learning_rate": 1.5873959791164986e-07, + "logits/chosen": 4.05078125, + "logits/rejected": 3.859375, + "logps/chosen": -833.5, + "logps/rejected": -605.0, + "loss": 0.4784, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4296875, + 
"rewards/margins": 6.1328125, + "rewards/rejected": -3.70703125, + "step": 4295 + }, + { + "epoch": 0.8524232352795278, + "grad_norm": 24.535508068861287, + "learning_rate": 1.5858573189450535e-07, + "logits/chosen": 3.5625, + "logits/rejected": 3.5625, + "logps/chosen": -899.0, + "logps/rejected": -583.0, + "loss": 0.2797, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.53125, + "rewards/margins": 7.828125, + "rewards/rejected": -4.3046875, + "step": 4296 + }, + { + "epoch": 0.8526216578203284, + "grad_norm": 29.987305906541554, + "learning_rate": 1.584320536318326e-07, + "logits/chosen": 4.515625, + "logits/rejected": 4.18359375, + "logps/chosen": -903.0, + "logps/rejected": -788.0, + "loss": 0.5169, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.84033203125, + "rewards/margins": 7.671875, + "rewards/rejected": -5.8203125, + "step": 4297 + }, + { + "epoch": 0.852820080361129, + "grad_norm": 28.330835254880032, + "learning_rate": 1.5827856319734805e-07, + "logits/chosen": 4.5859375, + "logits/rejected": 4.7109375, + "logps/chosen": -1264.0, + "logps/rejected": -1011.0, + "loss": 0.3914, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.193359375, + "rewards/margins": 10.2421875, + "rewards/rejected": -7.0390625, + "step": 4298 + }, + { + "epoch": 0.8530185029019297, + "grad_norm": 30.7557182622793, + "learning_rate": 1.5812526066467857e-07, + "logits/chosen": 4.203125, + "logits/rejected": 3.94140625, + "logps/chosen": -1458.0, + "logps/rejected": -1009.0, + "loss": 0.2139, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.6611328125, + "rewards/margins": 12.2109375, + "rewards/rejected": -8.5625, + "step": 4299 + }, + { + "epoch": 0.8532169254427303, + "grad_norm": 34.066286436219634, + "learning_rate": 1.5797214610736053e-07, + "logits/chosen": 3.77734375, + "logits/rejected": 4.12890625, + "logps/chosen": -1032.0, + "logps/rejected": -1695.0, + "loss": 0.3792, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.650390625, + 
"rewards/margins": 11.609375, + "rewards/rejected": -8.9453125, + "step": 4300 + }, + { + "epoch": 0.8534153479835309, + "grad_norm": 23.741656769801185, + "learning_rate": 1.5781921959884035e-07, + "logits/chosen": 4.03125, + "logits/rejected": 3.92578125, + "logps/chosen": -1295.0, + "logps/rejected": -707.0, + "loss": 0.3325, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.419921875, + "rewards/margins": 7.46875, + "rewards/rejected": -6.046875, + "step": 4301 + }, + { + "epoch": 0.8536137705243315, + "grad_norm": 29.257208569472716, + "learning_rate": 1.5766648121247411e-07, + "logits/chosen": 3.78515625, + "logits/rejected": 3.70703125, + "logps/chosen": -574.0, + "logps/rejected": -662.0, + "loss": 0.4929, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.0361328125, + "rewards/margins": 6.0078125, + "rewards/rejected": -3.962890625, + "step": 4302 + }, + { + "epoch": 0.8538121930651322, + "grad_norm": 37.07560585207833, + "learning_rate": 1.575139310215276e-07, + "logits/chosen": 4.3203125, + "logits/rejected": 4.2421875, + "logps/chosen": -1268.0, + "logps/rejected": -646.5, + "loss": 0.4401, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.966796875, + "rewards/margins": 9.0078125, + "rewards/rejected": -6.046875, + "step": 4303 + }, + { + "epoch": 0.8540106156059328, + "grad_norm": 24.043516784875173, + "learning_rate": 1.5736156909917673e-07, + "logits/chosen": 3.921875, + "logits/rejected": 3.8671875, + "logps/chosen": -1235.0, + "logps/rejected": -640.5, + "loss": 0.3696, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5693359375, + "rewards/margins": 8.48828125, + "rewards/rejected": -5.9140625, + "step": 4304 + }, + { + "epoch": 0.8542090381467334, + "grad_norm": 30.603278198518524, + "learning_rate": 1.5720939551850642e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.17578125, + "logps/chosen": -794.0, + "logps/rejected": -656.5, + "loss": 0.4212, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3935546875, + 
"rewards/margins": 6.62109375, + "rewards/rejected": -4.21875, + "step": 4305 + }, + { + "epoch": 0.8544074606875341, + "grad_norm": 39.624304412183015, + "learning_rate": 1.570574103525119e-07, + "logits/chosen": 3.47265625, + "logits/rejected": 3.8671875, + "logps/chosen": -586.5, + "logps/rejected": -730.0, + "loss": 0.6096, + "rewards/accuracies": 0.65625, + "rewards/chosen": 1.42578125, + "rewards/margins": 5.828125, + "rewards/rejected": -4.40234375, + "step": 4306 + }, + { + "epoch": 0.8546058832283348, + "grad_norm": 22.830492094553676, + "learning_rate": 1.5690561367409774e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 3.8515625, + "logps/chosen": -649.0, + "logps/rejected": -592.5, + "loss": 0.4335, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.263671875, + "rewards/margins": 7.4140625, + "rewards/rejected": -5.1484375, + "step": 4307 + }, + { + "epoch": 0.8548043057691354, + "grad_norm": 31.236645999744795, + "learning_rate": 1.5675400555607794e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.12109375, + "logps/chosen": -897.5, + "logps/rejected": -549.5, + "loss": 0.3679, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1962890625, + "rewards/margins": 6.69921875, + "rewards/rejected": -4.5, + "step": 4308 + }, + { + "epoch": 0.855002728309936, + "grad_norm": 23.648207254011954, + "learning_rate": 1.5660258607117655e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 4.11328125, + "logps/chosen": -1470.0, + "logps/rejected": -1493.0, + "loss": 0.244, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.08984375, + "rewards/margins": 12.359375, + "rewards/rejected": -9.296875, + "step": 4309 + }, + { + "epoch": 0.8552011508507367, + "grad_norm": 28.200377650019124, + "learning_rate": 1.5645135529202635e-07, + "logits/chosen": 4.29296875, + "logits/rejected": 4.32421875, + "logps/chosen": -813.0, + "logps/rejected": -639.5, + "loss": 0.3637, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.24609375, + 
"rewards/margins": 7.84375, + "rewards/rejected": -5.5859375, + "step": 4310 + }, + { + "epoch": 0.8553995733915373, + "grad_norm": 36.88130165252415, + "learning_rate": 1.5630031329117045e-07, + "logits/chosen": 3.7421875, + "logits/rejected": 3.76171875, + "logps/chosen": -1242.0, + "logps/rejected": -1973.0, + "loss": 0.4227, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.25390625, + "rewards/margins": 10.46875, + "rewards/rejected": -8.2109375, + "step": 4311 + }, + { + "epoch": 0.8555979959323379, + "grad_norm": 30.53489812399633, + "learning_rate": 1.5614946014106085e-07, + "logits/chosen": 3.953125, + "logits/rejected": 3.765625, + "logps/chosen": -878.5, + "logps/rejected": -560.0, + "loss": 0.4098, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3984375, + "rewards/margins": 7.3515625, + "rewards/rejected": -4.9609375, + "step": 4312 + }, + { + "epoch": 0.8557964184731386, + "grad_norm": 33.766056626977075, + "learning_rate": 1.5599879591405916e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.17578125, + "logps/chosen": -838.0, + "logps/rejected": -855.0, + "loss": 0.4357, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.142578125, + "rewards/margins": 7.1328125, + "rewards/rejected": -4.98046875, + "step": 4313 + }, + { + "epoch": 0.8559948410139392, + "grad_norm": 41.99723391565702, + "learning_rate": 1.5584832068243632e-07, + "logits/chosen": 4.015625, + "logits/rejected": 4.15625, + "logps/chosen": -842.25, + "logps/rejected": -1352.0, + "loss": 0.4024, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.060546875, + "rewards/margins": 9.38671875, + "rewards/rejected": -7.30078125, + "step": 4314 + }, + { + "epoch": 0.8561932635547398, + "grad_norm": 28.177144848001426, + "learning_rate": 1.5569803451837248e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 3.73828125, + "logps/chosen": -975.0, + "logps/rejected": -746.0, + "loss": 0.3626, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.818359375, + 
"rewards/margins": 9.484375, + "rewards/rejected": -6.6484375, + "step": 4315 + }, + { + "epoch": 0.8563916860955405, + "grad_norm": 35.24057832281483, + "learning_rate": 1.555479374939575e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.1328125, + "logps/chosen": -1094.5, + "logps/rejected": -697.5, + "loss": 0.3923, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.951171875, + "rewards/margins": 7.875, + "rewards/rejected": -5.9296875, + "step": 4316 + }, + { + "epoch": 0.8565901086363411, + "grad_norm": 18.21116910639581, + "learning_rate": 1.5539802968119017e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 4.1015625, + "logps/chosen": -1009.0, + "logps/rejected": -794.0, + "loss": 0.3696, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.841796875, + "rewards/margins": 8.265625, + "rewards/rejected": -5.42578125, + "step": 4317 + }, + { + "epoch": 0.8567885311771417, + "grad_norm": 40.88964654671343, + "learning_rate": 1.5524831115197863e-07, + "logits/chosen": 3.546875, + "logits/rejected": 3.38671875, + "logps/chosen": -1033.0, + "logps/rejected": -745.0, + "loss": 0.4628, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.548828125, + "rewards/margins": 7.1015625, + "rewards/rejected": -5.546875, + "step": 4318 + }, + { + "epoch": 0.8569869537179423, + "grad_norm": 26.468613413667292, + "learning_rate": 1.5509878197814002e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 4.4375, + "logps/chosen": -845.0, + "logps/rejected": -781.0, + "loss": 0.3186, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.626953125, + "rewards/margins": 8.7265625, + "rewards/rejected": -6.10546875, + "step": 4319 + }, + { + "epoch": 0.857185376258743, + "grad_norm": 41.19081086072824, + "learning_rate": 1.5494944223140107e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.3984375, + "logps/chosen": -988.0, + "logps/rejected": -909.0, + "loss": 0.3924, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9765625, + "rewards/margins": 
7.6875, + "rewards/rejected": -4.701171875, + "step": 4320 + }, + { + "epoch": 0.8573837987995436, + "grad_norm": 22.688352307352393, + "learning_rate": 1.548002919833971e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.37890625, + "logps/chosen": -914.5, + "logps/rejected": -1044.0, + "loss": 0.2656, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.087890625, + "rewards/margins": 11.1875, + "rewards/rejected": -8.109375, + "step": 4321 + }, + { + "epoch": 0.8575822213403442, + "grad_norm": 27.840864479187704, + "learning_rate": 1.5465133130567332e-07, + "logits/chosen": 3.8359375, + "logits/rejected": 3.75390625, + "logps/chosen": -1060.0, + "logps/rejected": -885.5, + "loss": 0.3428, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4609375, + "rewards/margins": 9.1171875, + "rewards/rejected": -6.6484375, + "step": 4322 + }, + { + "epoch": 0.8577806438811449, + "grad_norm": 32.40853816830528, + "learning_rate": 1.5450256026968294e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 4.1484375, + "logps/chosen": -1081.0, + "logps/rejected": -1081.5, + "loss": 0.2828, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.51953125, + "rewards/margins": 12.375, + "rewards/rejected": -8.8671875, + "step": 4323 + }, + { + "epoch": 0.8579790664219455, + "grad_norm": 33.51045303767039, + "learning_rate": 1.543539789467892e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.140625, + "logps/chosen": -990.0, + "logps/rejected": -826.0, + "loss": 0.4038, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.390625, + "rewards/margins": 8.078125, + "rewards/rejected": -5.67578125, + "step": 4324 + }, + { + "epoch": 0.8581774889627461, + "grad_norm": 38.662021767793235, + "learning_rate": 1.5420558740826383e-07, + "logits/chosen": 3.4609375, + "logits/rejected": 3.84765625, + "logps/chosen": -1040.0, + "logps/rejected": -771.0, + "loss": 0.3664, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1796875, + "rewards/margins": 7.28125, + 
"rewards/rejected": -4.109375, + "step": 4325 + }, + { + "epoch": 0.8583759115035468, + "grad_norm": 28.95442316818472, + "learning_rate": 1.5405738572528753e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 3.59765625, + "logps/chosen": -783.0, + "logps/rejected": -597.5, + "loss": 0.4774, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.50390625, + "rewards/margins": 6.90625, + "rewards/rejected": -4.3984375, + "step": 4326 + }, + { + "epoch": 0.8585743340443475, + "grad_norm": 26.60057063787768, + "learning_rate": 1.5390937396895006e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.38671875, + "logps/chosen": -868.0, + "logps/rejected": -784.0, + "loss": 0.3291, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.70703125, + "rewards/margins": 9.578125, + "rewards/rejected": -6.875, + "step": 4327 + }, + { + "epoch": 0.8587727565851481, + "grad_norm": 39.490031839036504, + "learning_rate": 1.5376155221024995e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 3.9375, + "logps/chosen": -1135.0, + "logps/rejected": -1391.0, + "loss": 0.3603, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6484375, + "rewards/margins": 10.1328125, + "rewards/rejected": -7.47265625, + "step": 4328 + }, + { + "epoch": 0.8589711791259487, + "grad_norm": 35.46821426112372, + "learning_rate": 1.5361392052009492e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 3.94921875, + "logps/chosen": -1227.0, + "logps/rejected": -1347.0, + "loss": 0.3785, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.037109375, + "rewards/margins": 9.5859375, + "rewards/rejected": -7.5703125, + "step": 4329 + }, + { + "epoch": 0.8591696016667494, + "grad_norm": 28.01073121492884, + "learning_rate": 1.5346647896930093e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 3.9375, + "logps/chosen": -1020.0, + "logps/rejected": -577.0, + "loss": 0.3179, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.02734375, + "rewards/margins": 7.765625, + "rewards/rejected": 
-4.7421875, + "step": 4330 + }, + { + "epoch": 0.85936802420755, + "grad_norm": 22.15378741133396, + "learning_rate": 1.5331922762859337e-07, + "logits/chosen": 4.31640625, + "logits/rejected": 4.2734375, + "logps/chosen": -956.0, + "logps/rejected": -1111.0, + "loss": 0.235, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.6640625, + "rewards/margins": 11.890625, + "rewards/rejected": -8.21875, + "step": 4331 + }, + { + "epoch": 0.8595664467483506, + "grad_norm": 28.436076652348053, + "learning_rate": 1.5317216656860587e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.14453125, + "logps/chosen": -821.0, + "logps/rejected": -716.0, + "loss": 0.47, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.68310546875, + "rewards/margins": 7.5390625, + "rewards/rejected": -5.8515625, + "step": 4332 + }, + { + "epoch": 0.8597648692891513, + "grad_norm": 21.84146776689914, + "learning_rate": 1.530252958598811e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 4.1015625, + "logps/chosen": -1108.0, + "logps/rejected": -649.0, + "loss": 0.2554, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.52734375, + "rewards/margins": 9.984375, + "rewards/rejected": -5.4609375, + "step": 4333 + }, + { + "epoch": 0.8599632918299519, + "grad_norm": 25.257102255434138, + "learning_rate": 1.5287861557287052e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.625, + "logps/chosen": -861.5, + "logps/rejected": -918.0, + "loss": 0.5851, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.9140625, + "rewards/margins": 7.2421875, + "rewards/rejected": -5.33203125, + "step": 4334 + }, + { + "epoch": 0.8601617143707525, + "grad_norm": 30.290674984670925, + "learning_rate": 1.5273212577793366e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 3.9375, + "logps/chosen": -1109.0, + "logps/rejected": -716.0, + "loss": 0.4195, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3828125, + "rewards/margins": 7.21875, + "rewards/rejected": -4.828125, + "step": 4335 + }, 
+ { + "epoch": 0.8603601369115531, + "grad_norm": 26.692488138519504, + "learning_rate": 1.525858265453394e-07, + "logits/chosen": 4.0625, + "logits/rejected": 3.82421875, + "logps/chosen": -806.0, + "logps/rejected": -609.0, + "loss": 0.4498, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.470703125, + "rewards/margins": 6.84375, + "rewards/rejected": -5.3828125, + "step": 4336 + }, + { + "epoch": 0.8605585594523538, + "grad_norm": 28.11052858089441, + "learning_rate": 1.5243971794526475e-07, + "logits/chosen": 4.23046875, + "logits/rejected": 3.77734375, + "logps/chosen": -1200.0, + "logps/rejected": -784.0, + "loss": 0.3174, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.232421875, + "rewards/margins": 9.1875, + "rewards/rejected": -5.9609375, + "step": 4337 + }, + { + "epoch": 0.8607569819931544, + "grad_norm": 26.36242909232533, + "learning_rate": 1.522938000477954e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.140625, + "logps/chosen": -891.0, + "logps/rejected": -863.0, + "loss": 0.4194, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.419921875, + "rewards/margins": 7.578125, + "rewards/rejected": -5.16015625, + "step": 4338 + }, + { + "epoch": 0.860955404533955, + "grad_norm": 25.22485921952292, + "learning_rate": 1.5214807292292563e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.37890625, + "logps/chosen": -1443.5, + "logps/rejected": -941.5, + "loss": 0.392, + "rewards/accuracies": 0.875, + "rewards/chosen": 4.01171875, + "rewards/margins": 9.3046875, + "rewards/rejected": -5.30859375, + "step": 4339 + }, + { + "epoch": 0.8611538270747557, + "grad_norm": 35.38075624505625, + "learning_rate": 1.5200253664055805e-07, + "logits/chosen": 4.3828125, + "logits/rejected": 4.3984375, + "logps/chosen": -936.0, + "logps/rejected": -1588.5, + "loss": 0.309, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.314453125, + "rewards/margins": 11.234375, + "rewards/rejected": -7.91796875, + "step": 4340 + }, + { + "epoch": 
0.8613522496155563, + "grad_norm": 30.602945776176306, + "learning_rate": 1.5185719127050398e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.0703125, + "logps/chosen": -892.0, + "logps/rejected": -1847.5, + "loss": 0.4105, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1328125, + "rewards/margins": 12.1171875, + "rewards/rejected": -8.98046875, + "step": 4341 + }, + { + "epoch": 0.861550672156357, + "grad_norm": 25.280567474195493, + "learning_rate": 1.5171203688248297e-07, + "logits/chosen": 3.59375, + "logits/rejected": 3.328125, + "logps/chosen": -1017.0, + "logps/rejected": -680.0, + "loss": 0.3306, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.150390625, + "rewards/margins": 8.3125, + "rewards/rejected": -5.1640625, + "step": 4342 + }, + { + "epoch": 0.8617490946971575, + "grad_norm": 31.421199581411223, + "learning_rate": 1.5156707354612295e-07, + "logits/chosen": 3.58984375, + "logits/rejected": 3.390625, + "logps/chosen": -1201.0, + "logps/rejected": -998.0, + "loss": 0.3695, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.73828125, + "rewards/margins": 18.1875, + "rewards/rejected": -15.5, + "step": 4343 + }, + { + "epoch": 0.8619475172379583, + "grad_norm": 20.479243010431336, + "learning_rate": 1.5142230133096023e-07, + "logits/chosen": 4.53125, + "logits/rejected": 4.12890625, + "logps/chosen": -1246.0, + "logps/rejected": -738.0, + "loss": 0.2796, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.26171875, + "rewards/margins": 10.890625, + "rewards/rejected": -7.6328125, + "step": 4344 + }, + { + "epoch": 0.8621459397787589, + "grad_norm": 27.0268779169348, + "learning_rate": 1.5127772030643958e-07, + "logits/chosen": 3.80859375, + "logits/rejected": 3.88671875, + "logps/chosen": -930.0, + "logps/rejected": -677.0, + "loss": 0.3163, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.59375, + "rewards/margins": 8.8671875, + "rewards/rejected": -5.265625, + "step": 4345 + }, + { + "epoch": 0.8623443623195595, + 
"grad_norm": 30.7194748862068, + "learning_rate": 1.5113333054191385e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.15625, + "logps/chosen": -988.5, + "logps/rejected": -1019.5, + "loss": 0.256, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.666015625, + "rewards/margins": 11.8125, + "rewards/rejected": -8.1484375, + "step": 4346 + }, + { + "epoch": 0.8625427848603602, + "grad_norm": 33.34444475358857, + "learning_rate": 1.5098913210664418e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.21484375, + "logps/chosen": -863.0, + "logps/rejected": -929.0, + "loss": 0.472, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3125, + "rewards/margins": 9.2265625, + "rewards/rejected": -6.9296875, + "step": 4347 + }, + { + "epoch": 0.8627412074011608, + "grad_norm": 29.726919512837494, + "learning_rate": 1.5084512506980023e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.34375, + "logps/chosen": -1015.0, + "logps/rejected": -813.0, + "loss": 0.4374, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.24169921875, + "rewards/margins": 8.2578125, + "rewards/rejected": -6.015625, + "step": 4348 + }, + { + "epoch": 0.8629396299419614, + "grad_norm": 28.481022068586874, + "learning_rate": 1.5070130950045947e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.42578125, + "logps/chosen": -933.0, + "logps/rejected": -608.0, + "loss": 0.3091, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.673828125, + "rewards/margins": 7.859375, + "rewards/rejected": -5.1796875, + "step": 4349 + }, + { + "epoch": 0.8631380524827621, + "grad_norm": 34.86378472899043, + "learning_rate": 1.505576854676077e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.359375, + "logps/chosen": -990.0, + "logps/rejected": -762.0, + "loss": 0.4763, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.50390625, + "rewards/margins": 9.0078125, + "rewards/rejected": -6.484375, + "step": 4350 + }, + { + "epoch": 0.8633364750235627, + "grad_norm": 
37.20538992344488, + "learning_rate": 1.5041425304013878e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.1953125, + "logps/chosen": -953.0, + "logps/rejected": -689.5, + "loss": 0.4502, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.724609375, + "rewards/margins": 8.08203125, + "rewards/rejected": -6.3671875, + "step": 4351 + }, + { + "epoch": 0.8635348975643633, + "grad_norm": 24.33499046765554, + "learning_rate": 1.5027101228685475e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.83203125, + "logps/chosen": -1099.0, + "logps/rejected": -730.5, + "loss": 0.3885, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.765625, + "rewards/margins": 9.8203125, + "rewards/rejected": -6.0390625, + "step": 4352 + }, + { + "epoch": 0.8637333201051639, + "grad_norm": 36.1156039910578, + "learning_rate": 1.5012796327646555e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 4.00390625, + "logps/chosen": -1292.0, + "logps/rejected": -851.0, + "loss": 0.277, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.12890625, + "rewards/margins": 9.984375, + "rewards/rejected": -6.84375, + "step": 4353 + }, + { + "epoch": 0.8639317426459646, + "grad_norm": 29.00440244026672, + "learning_rate": 1.4998510607758948e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.734375, + "logps/chosen": -1031.0, + "logps/rejected": -1702.0, + "loss": 0.3803, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.271484375, + "rewards/margins": 20.0625, + "rewards/rejected": -17.765625, + "step": 4354 + }, + { + "epoch": 0.8641301651867652, + "grad_norm": 41.50569403813497, + "learning_rate": 1.498424407587523e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 3.6328125, + "logps/chosen": -948.0, + "logps/rejected": -705.0, + "loss": 0.3276, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.52734375, + "rewards/margins": 8.2109375, + "rewards/rejected": -5.6796875, + "step": 4355 + }, + { + "epoch": 0.8643285877275658, + "grad_norm": 
37.83008123383028, + "learning_rate": 1.496999673883882e-07, + "logits/chosen": 4.375, + "logits/rejected": 4.28125, + "logps/chosen": -1195.0, + "logps/rejected": -926.0, + "loss": 0.4389, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5078125, + "rewards/margins": 7.6171875, + "rewards/rejected": -5.109375, + "step": 4356 + }, + { + "epoch": 0.8645270102683665, + "grad_norm": 33.25766888226679, + "learning_rate": 1.4955768603483915e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 3.84765625, + "logps/chosen": -1066.0, + "logps/rejected": -743.0, + "loss": 0.428, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.626953125, + "rewards/margins": 8.421875, + "rewards/rejected": -5.78515625, + "step": 4357 + }, + { + "epoch": 0.8647254328091671, + "grad_norm": 20.461956840801278, + "learning_rate": 1.4941559676635487e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 3.83203125, + "logps/chosen": -947.0, + "logps/rejected": -686.0, + "loss": 0.3234, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.6796875, + "rewards/margins": 9.171875, + "rewards/rejected": -5.5, + "step": 4358 + }, + { + "epoch": 0.8649238553499677, + "grad_norm": 31.402058807179856, + "learning_rate": 1.4927369965109333e-07, + "logits/chosen": 3.7421875, + "logits/rejected": 3.72265625, + "logps/chosen": -899.5, + "logps/rejected": -580.0, + "loss": 0.4603, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.134765625, + "rewards/margins": 6.63671875, + "rewards/rejected": -4.505859375, + "step": 4359 + }, + { + "epoch": 0.8651222778907683, + "grad_norm": 31.113530103238812, + "learning_rate": 1.491319947571196e-07, + "logits/chosen": 4.4765625, + "logits/rejected": 4.16015625, + "logps/chosen": -1154.0, + "logps/rejected": -738.0, + "loss": 0.4243, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.982421875, + "rewards/margins": 8.38671875, + "rewards/rejected": -5.39453125, + "step": 4360 + }, + { + "epoch": 0.8653207004315691, + "grad_norm": 31.084247515175377, + 
"learning_rate": 1.489904821524074e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.96875, + "logps/chosen": -978.0, + "logps/rejected": -806.0, + "loss": 0.3687, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8828125, + "rewards/margins": 9.078125, + "rewards/rejected": -6.21484375, + "step": 4361 + }, + { + "epoch": 0.8655191229723697, + "grad_norm": 30.208233414757977, + "learning_rate": 1.4884916190483764e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.11328125, + "logps/chosen": -940.0, + "logps/rejected": -717.0, + "loss": 0.2738, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.35546875, + "rewards/margins": 8.5625, + "rewards/rejected": -5.203125, + "step": 4362 + }, + { + "epoch": 0.8657175455131703, + "grad_norm": 25.5035586377348, + "learning_rate": 1.4870803408219912e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.171875, + "logps/chosen": -773.5, + "logps/rejected": -795.5, + "loss": 0.4136, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.26171875, + "rewards/margins": 18.25, + "rewards/rejected": -15.9609375, + "step": 4363 + }, + { + "epoch": 0.865915968053971, + "grad_norm": 34.89610027042089, + "learning_rate": 1.4856709875218857e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.05078125, + "logps/chosen": -1390.0, + "logps/rejected": -779.0, + "loss": 0.3667, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.779296875, + "rewards/margins": 8.203125, + "rewards/rejected": -5.4296875, + "step": 4364 + }, + { + "epoch": 0.8661143905947716, + "grad_norm": 32.028142100213564, + "learning_rate": 1.4842635598240978e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.3828125, + "logps/chosen": -851.0, + "logps/rejected": -889.0, + "loss": 0.4214, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3359375, + "rewards/margins": 7.80078125, + "rewards/rejected": -5.46484375, + "step": 4365 + }, + { + "epoch": 0.8663128131355722, + "grad_norm": 27.851653158831372, + "learning_rate": 
1.4828580584037491e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.01953125, + "logps/chosen": -1159.0, + "logps/rejected": -732.0, + "loss": 0.2971, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.017578125, + "rewards/margins": 8.796875, + "rewards/rejected": -5.78125, + "step": 4366 + }, + { + "epoch": 0.8665112356763728, + "grad_norm": 30.56729908808112, + "learning_rate": 1.4814544839350304e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.7578125, + "logps/chosen": -880.0, + "logps/rejected": -662.0, + "loss": 0.4169, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.33984375, + "rewards/margins": 6.365234375, + "rewards/rejected": -4.038818359375, + "step": 4367 + }, + { + "epoch": 0.8667096582171735, + "grad_norm": 19.76177825126675, + "learning_rate": 1.4800528370912143e-07, + "logits/chosen": 3.65234375, + "logits/rejected": 3.47265625, + "logps/chosen": -959.0, + "logps/rejected": -602.5, + "loss": 0.3122, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.59765625, + "rewards/margins": 7.8046875, + "rewards/rejected": -5.19140625, + "step": 4368 + }, + { + "epoch": 0.8669080807579741, + "grad_norm": 20.56133065269182, + "learning_rate": 1.4786531185446452e-07, + "logits/chosen": 4.34765625, + "logits/rejected": 3.9140625, + "logps/chosen": -1233.0, + "logps/rejected": -870.0, + "loss": 0.3146, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.8828125, + "rewards/margins": 10.84375, + "rewards/rejected": -6.9609375, + "step": 4369 + }, + { + "epoch": 0.8671065032987747, + "grad_norm": 23.456350304894723, + "learning_rate": 1.4772553289667424e-07, + "logits/chosen": 4.578125, + "logits/rejected": 4.4921875, + "logps/chosen": -906.0, + "logps/rejected": -579.5, + "loss": 0.3629, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.302734375, + "rewards/margins": 7.3671875, + "rewards/rejected": -5.0703125, + "step": 4370 + }, + { + "epoch": 0.8673049258395754, + "grad_norm": 33.236375536922914, + "learning_rate": 
1.4758594690280017e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.140625, + "logps/chosen": -623.0, + "logps/rejected": -576.5, + "loss": 0.5741, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.736328125, + "rewards/margins": 5.98046875, + "rewards/rejected": -4.259765625, + "step": 4371 + }, + { + "epoch": 0.867503348380376, + "grad_norm": 24.176072073506525, + "learning_rate": 1.4744655393979905e-07, + "logits/chosen": 4.625, + "logits/rejected": 4.4921875, + "logps/chosen": -1090.0, + "logps/rejected": -689.5, + "loss": 0.3289, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.279296875, + "rewards/margins": 8.953125, + "rewards/rejected": -5.6796875, + "step": 4372 + }, + { + "epoch": 0.8677017709211766, + "grad_norm": 34.25172495940537, + "learning_rate": 1.473073540745354e-07, + "logits/chosen": 3.9375, + "logits/rejected": 4.14453125, + "logps/chosen": -1124.0, + "logps/rejected": -689.5, + "loss": 0.3304, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7109375, + "rewards/margins": 8.890625, + "rewards/rejected": -6.1796875, + "step": 4373 + }, + { + "epoch": 0.8679001934619773, + "grad_norm": 26.452823707974186, + "learning_rate": 1.471683473737809e-07, + "logits/chosen": 3.51953125, + "logits/rejected": 3.375, + "logps/chosen": -766.0, + "logps/rejected": -684.0, + "loss": 0.4839, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.029296875, + "rewards/margins": 12.734375, + "rewards/rejected": -10.734375, + "step": 4374 + }, + { + "epoch": 0.8680986160027779, + "grad_norm": 27.17343833168946, + "learning_rate": 1.4702953390421457e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 4.1015625, + "logps/chosen": -892.0, + "logps/rejected": -922.0, + "loss": 0.3752, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.41796875, + "rewards/margins": 10.34375, + "rewards/rejected": -7.9453125, + "step": 4375 + }, + { + "epoch": 0.8682970385435785, + "grad_norm": 25.285890497329312, + "learning_rate": 1.468909137324227e-07, 
+ "logits/chosen": 4.26953125, + "logits/rejected": 4.13671875, + "logps/chosen": -1344.0, + "logps/rejected": -2239.0, + "loss": 0.3243, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.24609375, + "rewards/margins": 13.078125, + "rewards/rejected": -9.828125, + "step": 4376 + }, + { + "epoch": 0.8684954610843791, + "grad_norm": 32.60638675279232, + "learning_rate": 1.4675248692489902e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.3125, + "logps/chosen": -922.0, + "logps/rejected": -1322.5, + "loss": 0.4035, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.310546875, + "rewards/margins": 9.12109375, + "rewards/rejected": -6.8046875, + "step": 4377 + }, + { + "epoch": 0.8686938836251799, + "grad_norm": 29.569126989861374, + "learning_rate": 1.466142535480443e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.5234375, + "logps/chosen": -1062.0, + "logps/rejected": -2011.0, + "loss": 0.2436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.8359375, + "rewards/margins": 16.125, + "rewards/rejected": -12.296875, + "step": 4378 + }, + { + "epoch": 0.8688923061659805, + "grad_norm": 33.22829215893372, + "learning_rate": 1.4647621366816672e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 3.6875, + "logps/chosen": -870.0, + "logps/rejected": -640.0, + "loss": 0.46, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.68359375, + "rewards/margins": 7.6015625, + "rewards/rejected": -4.93359375, + "step": 4379 + }, + { + "epoch": 0.8690907287067811, + "grad_norm": 30.983139092969633, + "learning_rate": 1.463383673514816e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.9296875, + "logps/chosen": -951.0, + "logps/rejected": -700.0, + "loss": 0.4456, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.11328125, + "rewards/margins": 7.6796875, + "rewards/rejected": -5.57421875, + "step": 4380 + }, + { + "epoch": 0.8692891512475818, + "grad_norm": 24.87477039877925, + "learning_rate": 1.4620071466411125e-07, + "logits/chosen": 
3.59765625, + "logits/rejected": 3.80078125, + "logps/chosen": -1168.0, + "logps/rejected": -1112.0, + "loss": 0.4503, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7060546875, + "rewards/margins": 9.23046875, + "rewards/rejected": -6.50390625, + "step": 4381 + }, + { + "epoch": 0.8694875737883824, + "grad_norm": 27.403048242439354, + "learning_rate": 1.4606325567208533e-07, + "logits/chosen": 3.51171875, + "logits/rejected": 3.59765625, + "logps/chosen": -915.0, + "logps/rejected": -745.0, + "loss": 0.4112, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.208984375, + "rewards/margins": 7.875, + "rewards/rejected": -5.66796875, + "step": 4382 + }, + { + "epoch": 0.869685996329183, + "grad_norm": 35.602306137761126, + "learning_rate": 1.459259904413403e-07, + "logits/chosen": 3.96875, + "logits/rejected": 4.1484375, + "logps/chosen": -1239.0, + "logps/rejected": -1018.0, + "loss": 0.302, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.59765625, + "rewards/margins": 10.8125, + "rewards/rejected": -8.21875, + "step": 4383 + }, + { + "epoch": 0.8698844188699836, + "grad_norm": 29.867301907145375, + "learning_rate": 1.4578891903772018e-07, + "logits/chosen": 4.140625, + "logits/rejected": 3.9921875, + "logps/chosen": -892.0, + "logps/rejected": -568.5, + "loss": 0.4302, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.287109375, + "rewards/margins": 6.59375, + "rewards/rejected": -4.3125, + "step": 4384 + }, + { + "epoch": 0.8700828414107843, + "grad_norm": 27.78179653650017, + "learning_rate": 1.4565204152697528e-07, + "logits/chosen": 3.3984375, + "logits/rejected": 3.4140625, + "logps/chosen": -766.0, + "logps/rejected": -676.0, + "loss": 0.5145, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.65234375, + "rewards/margins": 6.37109375, + "rewards/rejected": -4.71875, + "step": 4385 + }, + { + "epoch": 0.8702812639515849, + "grad_norm": 38.8394842517114, + "learning_rate": 1.455153579747637e-07, + "logits/chosen": 4.1171875, + 
"logits/rejected": 4.234375, + "logps/chosen": -924.0, + "logps/rejected": -662.5, + "loss": 0.3376, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.46875, + "rewards/margins": 9.984375, + "rewards/rejected": -7.5078125, + "step": 4386 + }, + { + "epoch": 0.8704796864923855, + "grad_norm": 33.86097512972803, + "learning_rate": 1.4537886844664976e-07, + "logits/chosen": 4.28515625, + "logits/rejected": 4.40234375, + "logps/chosen": -1061.0, + "logps/rejected": -789.0, + "loss": 0.431, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.845703125, + "rewards/margins": 8.8203125, + "rewards/rejected": -5.984375, + "step": 4387 + }, + { + "epoch": 0.8706781090331862, + "grad_norm": 26.605826678530605, + "learning_rate": 1.4524257300810532e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 3.80078125, + "logps/chosen": -995.5, + "logps/rejected": -700.0, + "loss": 0.4788, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.193603515625, + "rewards/margins": 7.9296875, + "rewards/rejected": -5.74609375, + "step": 4388 + }, + { + "epoch": 0.8708765315739868, + "grad_norm": 36.239007130103325, + "learning_rate": 1.451064717245088e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 4.04296875, + "logps/chosen": -775.0, + "logps/rejected": -713.0, + "loss": 0.6075, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.841796875, + "rewards/margins": 6.1796875, + "rewards/rejected": -4.3359375, + "step": 4389 + }, + { + "epoch": 0.8710749541147874, + "grad_norm": 18.218250240081225, + "learning_rate": 1.449705646611455e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.359375, + "logps/chosen": -1026.0, + "logps/rejected": -778.5, + "loss": 0.29, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9765625, + "rewards/margins": 9.6015625, + "rewards/rejected": -6.61328125, + "step": 4390 + }, + { + "epoch": 0.8712733766555881, + "grad_norm": 36.92612655956419, + "learning_rate": 1.4483485188320777e-07, + "logits/chosen": 3.42578125, + 
"logits/rejected": 3.703125, + "logps/chosen": -891.0, + "logps/rejected": -682.0, + "loss": 0.4466, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.701171875, + "rewards/margins": 7.6171875, + "rewards/rejected": -5.92578125, + "step": 4391 + }, + { + "epoch": 0.8714717991963887, + "grad_norm": 40.00296046811454, + "learning_rate": 1.446993334557944e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.06640625, + "logps/chosen": -1197.0, + "logps/rejected": -1258.5, + "loss": 0.495, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.14453125, + "rewards/margins": 8.78125, + "rewards/rejected": -6.625, + "step": 4392 + }, + { + "epoch": 0.8716702217371893, + "grad_norm": 28.501790743509975, + "learning_rate": 1.4456400944391144e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 4.1015625, + "logps/chosen": -1254.0, + "logps/rejected": -1152.0, + "loss": 0.2958, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.720703125, + "rewards/margins": 11.8125, + "rewards/rejected": -9.078125, + "step": 4393 + }, + { + "epoch": 0.8718686442779899, + "grad_norm": 27.848728415149584, + "learning_rate": 1.444288799124712e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.1328125, + "logps/chosen": -860.0, + "logps/rejected": -756.5, + "loss": 0.4224, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.734375, + "rewards/margins": 7.3203125, + "rewards/rejected": -4.578125, + "step": 4394 + }, + { + "epoch": 0.8720670668187906, + "grad_norm": 32.10241014383643, + "learning_rate": 1.4429394492629296e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 3.8984375, + "logps/chosen": -1097.0, + "logps/rejected": -893.0, + "loss": 0.3617, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.033203125, + "rewards/margins": 8.8046875, + "rewards/rejected": -6.765625, + "step": 4395 + }, + { + "epoch": 0.8722654893595913, + "grad_norm": 28.09437320929703, + "learning_rate": 1.4415920455010283e-07, + "logits/chosen": 4.125, + "logits/rejected": 3.7578125, + 
"logps/chosen": -1072.0, + "logps/rejected": -802.5, + "loss": 0.3793, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.197265625, + "rewards/margins": 7.9765625, + "rewards/rejected": -5.7890625, + "step": 4396 + }, + { + "epoch": 0.8724639119003919, + "grad_norm": 28.447341675859178, + "learning_rate": 1.4402465884853301e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.06640625, + "logps/chosen": -1170.0, + "logps/rejected": -912.0, + "loss": 0.346, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.146484375, + "rewards/margins": 10.296875, + "rewards/rejected": -7.1328125, + "step": 4397 + }, + { + "epoch": 0.8726623344411926, + "grad_norm": 27.294618246062885, + "learning_rate": 1.43890307886123e-07, + "logits/chosen": 3.765625, + "logits/rejected": 3.6015625, + "logps/chosen": -1272.0, + "logps/rejected": -845.0, + "loss": 0.3863, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.76171875, + "rewards/margins": 9.3046875, + "rewards/rejected": -6.546875, + "step": 4398 + }, + { + "epoch": 0.8728607569819932, + "grad_norm": 28.232459347555164, + "learning_rate": 1.437561517273185e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.3203125, + "logps/chosen": -824.5, + "logps/rejected": -1124.0, + "loss": 0.3251, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.36328125, + "rewards/margins": 12.890625, + "rewards/rejected": -9.53125, + "step": 4399 + }, + { + "epoch": 0.8730591795227938, + "grad_norm": 29.37533852048217, + "learning_rate": 1.4362219043647175e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 4.1015625, + "logps/chosen": -916.0, + "logps/rejected": -807.0, + "loss": 0.4274, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4248046875, + "rewards/margins": 7.4296875, + "rewards/rejected": -5.0, + "step": 4400 + }, + { + "epoch": 0.8732576020635944, + "grad_norm": 42.03467935311878, + "learning_rate": 1.4348842407784166e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 4.203125, + "logps/chosen": 
-789.0, + "logps/rejected": -538.0, + "loss": 0.3112, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.87890625, + "rewards/margins": 9.203125, + "rewards/rejected": -6.3359375, + "step": 4401 + }, + { + "epoch": 0.8734560246043951, + "grad_norm": 23.54695711319812, + "learning_rate": 1.4335485271559357e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.078125, + "logps/chosen": -783.0, + "logps/rejected": -649.0, + "loss": 0.3662, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.73828125, + "rewards/margins": 8.3828125, + "rewards/rejected": -5.640625, + "step": 4402 + }, + { + "epoch": 0.8736544471451957, + "grad_norm": 43.15861343931206, + "learning_rate": 1.432214764137992e-07, + "logits/chosen": 4.2265625, + "logits/rejected": 4.30078125, + "logps/chosen": -804.5, + "logps/rejected": -999.0, + "loss": 0.3645, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.443359375, + "rewards/margins": 10.0703125, + "rewards/rejected": -7.6171875, + "step": 4403 + }, + { + "epoch": 0.8738528696859963, + "grad_norm": 32.787077756971634, + "learning_rate": 1.4308829523643694e-07, + "logits/chosen": 3.921875, + "logits/rejected": 4.13671875, + "logps/chosen": -934.0, + "logps/rejected": -1221.0, + "loss": 0.454, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.71484375, + "rewards/margins": 8.84375, + "rewards/rejected": -6.140625, + "step": 4404 + }, + { + "epoch": 0.874051292226797, + "grad_norm": 25.56543537371411, + "learning_rate": 1.4295530924739139e-07, + "logits/chosen": 4.20703125, + "logits/rejected": 4.0625, + "logps/chosen": -775.0, + "logps/rejected": -612.0, + "loss": 0.4179, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.587890625, + "rewards/margins": 7.37890625, + "rewards/rejected": -4.802734375, + "step": 4405 + }, + { + "epoch": 0.8742497147675976, + "grad_norm": 32.79111521487625, + "learning_rate": 1.4282251851045353e-07, + "logits/chosen": 3.73046875, + "logits/rejected": 3.98046875, + "logps/chosen": -1010.0, + 
"logps/rejected": -1070.0, + "loss": 0.433, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.658203125, + "rewards/margins": 9.015625, + "rewards/rejected": -6.359375, + "step": 4406 + }, + { + "epoch": 0.8744481373083982, + "grad_norm": 29.125771549779643, + "learning_rate": 1.426899230893207e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.4453125, + "logps/chosen": -840.0, + "logps/rejected": -762.0, + "loss": 0.4002, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.138671875, + "rewards/margins": 7.59375, + "rewards/rejected": -5.453125, + "step": 4407 + }, + { + "epoch": 0.8746465598491989, + "grad_norm": 25.611534830810015, + "learning_rate": 1.425575230475966e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 4.015625, + "logps/chosen": -1020.0, + "logps/rejected": -659.5, + "loss": 0.3534, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.142578125, + "rewards/margins": 8.9296875, + "rewards/rejected": -5.7890625, + "step": 4408 + }, + { + "epoch": 0.8748449823899995, + "grad_norm": 28.9912898122279, + "learning_rate": 1.424253184487913e-07, + "logits/chosen": 3.73046875, + "logits/rejected": 3.6640625, + "logps/chosen": -782.0, + "logps/rejected": -536.5, + "loss": 0.2917, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.435546875, + "rewards/margins": 8.6796875, + "rewards/rejected": -6.2265625, + "step": 4409 + }, + { + "epoch": 0.8750434049308001, + "grad_norm": 29.245939319998083, + "learning_rate": 1.4229330935632069e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 3.9921875, + "logps/chosen": -766.0, + "logps/rejected": -622.0, + "loss": 0.3662, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.29296875, + "rewards/margins": 9.71875, + "rewards/rejected": -7.4296875, + "step": 4410 + }, + { + "epoch": 0.8752418274716007, + "grad_norm": 35.214526937465884, + "learning_rate": 1.4216149583350755e-07, + "logits/chosen": 3.796875, + "logits/rejected": 3.78125, + "logps/chosen": -1077.5, + "logps/rejected": 
-785.0, + "loss": 0.3586, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5390625, + "rewards/margins": 8.0546875, + "rewards/rejected": -5.5234375, + "step": 4411 + }, + { + "epoch": 0.8754402500124014, + "grad_norm": 35.16072201360534, + "learning_rate": 1.4202987794358024e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.265625, + "logps/chosen": -727.5, + "logps/rejected": -640.0, + "loss": 0.5136, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.4853515625, + "rewards/margins": 7.0546875, + "rewards/rejected": -5.578125, + "step": 4412 + }, + { + "epoch": 0.875638672553202, + "grad_norm": 32.01090822261441, + "learning_rate": 1.418984557496736e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.98828125, + "logps/chosen": -855.0, + "logps/rejected": -627.5, + "loss": 0.4546, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.3271484375, + "rewards/margins": 6.8515625, + "rewards/rejected": -4.521484375, + "step": 4413 + }, + { + "epoch": 0.8758370950940026, + "grad_norm": 34.72766751239485, + "learning_rate": 1.417672293148285e-07, + "logits/chosen": 4.32421875, + "logits/rejected": 4.4921875, + "logps/chosen": -1265.5, + "logps/rejected": -903.5, + "loss": 0.4298, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.404296875, + "rewards/margins": 7.8125, + "rewards/rejected": -5.4140625, + "step": 4414 + }, + { + "epoch": 0.8760355176348034, + "grad_norm": 44.06042085035199, + "learning_rate": 1.4163619870199188e-07, + "logits/chosen": 4.0625, + "logits/rejected": 3.921875, + "logps/chosen": -914.0, + "logps/rejected": -650.5, + "loss": 0.4287, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.814453125, + "rewards/margins": 7.72265625, + "rewards/rejected": -5.900390625, + "step": 4415 + }, + { + "epoch": 0.876233940175604, + "grad_norm": 30.5485721070378, + "learning_rate": 1.4150536397401705e-07, + "logits/chosen": 3.859375, + "logits/rejected": 3.7265625, + "logps/chosen": -1092.5, + "logps/rejected": -726.75, + 
"loss": 0.3558, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.04296875, + "rewards/margins": 9.3671875, + "rewards/rejected": -6.3046875, + "step": 4416 + }, + { + "epoch": 0.8764323627164046, + "grad_norm": 34.598395131429825, + "learning_rate": 1.4137472519366277e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 4.2421875, + "logps/chosen": -1080.0, + "logps/rejected": -624.5, + "loss": 0.335, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.544921875, + "rewards/margins": 7.421875, + "rewards/rejected": -4.8828125, + "step": 4417 + }, + { + "epoch": 0.8766307852572052, + "grad_norm": 37.336650818468335, + "learning_rate": 1.4124428242359433e-07, + "logits/chosen": 4.24609375, + "logits/rejected": 4.1171875, + "logps/chosen": -1086.0, + "logps/rejected": -941.0, + "loss": 0.387, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.390625, + "rewards/margins": 8.8359375, + "rewards/rejected": -5.44921875, + "step": 4418 + }, + { + "epoch": 0.8768292077980059, + "grad_norm": 37.96032919197077, + "learning_rate": 1.4111403572638282e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.09765625, + "logps/chosen": -1194.0, + "logps/rejected": -751.0, + "loss": 0.4527, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.44921875, + "rewards/margins": 7.111328125, + "rewards/rejected": -5.6484375, + "step": 4419 + }, + { + "epoch": 0.8770276303388065, + "grad_norm": 32.429642904168034, + "learning_rate": 1.4098398516450509e-07, + "logits/chosen": 3.8359375, + "logits/rejected": 3.80078125, + "logps/chosen": -1060.0, + "logps/rejected": -1066.0, + "loss": 0.2852, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.70068359375, + "rewards/margins": 10.4453125, + "rewards/rejected": -7.7421875, + "step": 4420 + }, + { + "epoch": 0.8772260528796071, + "grad_norm": 22.11363570757131, + "learning_rate": 1.408541308003444e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 3.84765625, + "logps/chosen": -1085.0, + "logps/rejected": -708.0, + 
"loss": 0.2719, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9375, + "rewards/margins": 14.4140625, + "rewards/rejected": -11.4765625, + "step": 4421 + }, + { + "epoch": 0.8774244754204078, + "grad_norm": 36.76339760877866, + "learning_rate": 1.407244726961892e-07, + "logits/chosen": 4.22265625, + "logits/rejected": 4.21875, + "logps/chosen": -987.0, + "logps/rejected": -701.0, + "loss": 0.3809, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.01171875, + "rewards/margins": 8.5546875, + "rewards/rejected": -5.5390625, + "step": 4422 + }, + { + "epoch": 0.8776228979612084, + "grad_norm": 37.08193875874208, + "learning_rate": 1.4059501091423448e-07, + "logits/chosen": 4.078125, + "logits/rejected": 3.91796875, + "logps/chosen": -944.5, + "logps/rejected": -927.0, + "loss": 0.3843, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.69921875, + "rewards/margins": 17.203125, + "rewards/rejected": -14.4921875, + "step": 4423 + }, + { + "epoch": 0.877821320502009, + "grad_norm": 28.783081569180716, + "learning_rate": 1.4046574551658068e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 4.0859375, + "logps/chosen": -767.0, + "logps/rejected": -650.0, + "loss": 0.533, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.9677734375, + "rewards/margins": 6.875, + "rewards/rejected": -4.89453125, + "step": 4424 + }, + { + "epoch": 0.8780197430428096, + "grad_norm": 38.1790217001061, + "learning_rate": 1.4033667656523404e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.65625, + "logps/chosen": -962.0, + "logps/rejected": -668.0, + "loss": 0.3647, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.64453125, + "rewards/margins": 7.859375, + "rewards/rejected": -5.22265625, + "step": 4425 + }, + { + "epoch": 0.8782181655836103, + "grad_norm": 27.112786959133288, + "learning_rate": 1.4020780412210672e-07, + "logits/chosen": 3.703125, + "logits/rejected": 3.58984375, + "logps/chosen": -835.0, + "logps/rejected": -576.0, + "loss": 0.317, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 3.0078125, + "rewards/margins": 8.3359375, + "rewards/rejected": -5.3359375, + "step": 4426 + }, + { + "epoch": 0.8784165881244109, + "grad_norm": 25.53776597940363, + "learning_rate": 1.400791282490164e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.4375, + "logps/chosen": -822.5, + "logps/rejected": -879.0, + "loss": 0.32, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.525390625, + "rewards/margins": 11.1640625, + "rewards/rejected": -7.609375, + "step": 4427 + }, + { + "epoch": 0.8786150106652115, + "grad_norm": 25.210377927637097, + "learning_rate": 1.3995064900768683e-07, + "logits/chosen": 4.53125, + "logits/rejected": 4.5078125, + "logps/chosen": -1154.0, + "logps/rejected": -737.5, + "loss": 0.2568, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.56640625, + "rewards/margins": 9.546875, + "rewards/rejected": -5.984375, + "step": 4428 + }, + { + "epoch": 0.8788134332060122, + "grad_norm": 26.61554095446937, + "learning_rate": 1.398223664597471e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 3.90234375, + "logps/chosen": -782.0, + "logps/rejected": -836.5, + "loss": 0.4076, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.025390625, + "rewards/margins": 13.1875, + "rewards/rejected": -11.1875, + "step": 4429 + }, + { + "epoch": 0.8790118557468128, + "grad_norm": 30.247447379378727, + "learning_rate": 1.3969428066673205e-07, + "logits/chosen": 4.046875, + "logits/rejected": 4.1171875, + "logps/chosen": -995.0, + "logps/rejected": -701.0, + "loss": 0.3641, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8984375, + "rewards/margins": 8.1953125, + "rewards/rejected": -5.29296875, + "step": 4430 + }, + { + "epoch": 0.8792102782876134, + "grad_norm": 29.055992849285662, + "learning_rate": 1.3956639169008222e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.35546875, + "logps/chosen": -918.0, + "logps/rejected": -829.0, + "loss": 0.4858, + "rewards/accuracies": 0.75, + 
"rewards/chosen": 2.45703125, + "rewards/margins": 6.25390625, + "rewards/rejected": -3.80859375, + "step": 4431 + }, + { + "epoch": 0.8794087008284142, + "grad_norm": 31.282919660568314, + "learning_rate": 1.394386995911437e-07, + "logits/chosen": 3.62890625, + "logits/rejected": 3.75390625, + "logps/chosen": -998.0, + "logps/rejected": -884.0, + "loss": 0.3355, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.34375, + "rewards/margins": 18.84375, + "rewards/rejected": -16.453125, + "step": 4432 + }, + { + "epoch": 0.8796071233692148, + "grad_norm": 30.54848693362261, + "learning_rate": 1.3931120443116798e-07, + "logits/chosen": 3.3828125, + "logits/rejected": 3.4140625, + "logps/chosen": -783.0, + "logps/rejected": -568.5, + "loss": 0.3613, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.9736328125, + "rewards/margins": 7.5390625, + "rewards/rejected": -5.55859375, + "step": 4433 + }, + { + "epoch": 0.8798055459100154, + "grad_norm": 29.69108825625417, + "learning_rate": 1.391839062713124e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.06640625, + "logps/chosen": -1312.0, + "logps/rejected": -896.0, + "loss": 0.2786, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.28759765625, + "rewards/margins": 11.1171875, + "rewards/rejected": -8.828125, + "step": 4434 + }, + { + "epoch": 0.880003968450816, + "grad_norm": 29.81217663811963, + "learning_rate": 1.3905680517263943e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.515625, + "logps/chosen": -885.0, + "logps/rejected": -759.0, + "loss": 0.3905, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.181640625, + "rewards/margins": 8.25, + "rewards/rejected": -6.0546875, + "step": 4435 + }, + { + "epoch": 0.8802023909916167, + "grad_norm": 35.73302585946708, + "learning_rate": 1.3892990119611735e-07, + "logits/chosen": 3.6484375, + "logits/rejected": 3.9296875, + "logps/chosen": -696.5, + "logps/rejected": -683.0, + "loss": 0.6086, + "rewards/accuracies": 0.78125, + 
"rewards/chosen": 0.60986328125, + "rewards/margins": 6.0546875, + "rewards/rejected": -5.4453125, + "step": 4436 + }, + { + "epoch": 0.8804008135324173, + "grad_norm": 26.697998879063274, + "learning_rate": 1.3880319440261982e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.6328125, + "logps/chosen": -1132.0, + "logps/rejected": -873.5, + "loss": 0.3885, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.783203125, + "rewards/margins": 8.0703125, + "rewards/rejected": -5.28125, + "step": 4437 + }, + { + "epoch": 0.8805992360732179, + "grad_norm": 38.76658139315939, + "learning_rate": 1.3867668485292566e-07, + "logits/chosen": 4.484375, + "logits/rejected": 4.421875, + "logps/chosen": -1036.0, + "logps/rejected": -885.0, + "loss": 0.2608, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.09765625, + "rewards/margins": 9.921875, + "rewards/rejected": -6.84375, + "step": 4438 + }, + { + "epoch": 0.8807976586140186, + "grad_norm": 27.501264910387842, + "learning_rate": 1.3855037260771936e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.8046875, + "logps/chosen": -1054.5, + "logps/rejected": -1172.5, + "loss": 0.2696, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.10546875, + "rewards/margins": 11.5234375, + "rewards/rejected": -8.40625, + "step": 4439 + }, + { + "epoch": 0.8809960811548192, + "grad_norm": 41.42511539092438, + "learning_rate": 1.384242577275905e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 3.96484375, + "logps/chosen": -803.5, + "logps/rejected": -702.0, + "loss": 0.4288, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.787109375, + "rewards/margins": 7.2421875, + "rewards/rejected": -5.453125, + "step": 4440 + }, + { + "epoch": 0.8811945036956198, + "grad_norm": 27.42763003071397, + "learning_rate": 1.3829834027303445e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.2578125, + "logps/chosen": -750.5, + "logps/rejected": -559.0, + "loss": 0.4229, + "rewards/accuracies": 0.9375, + 
"rewards/chosen": 2.5703125, + "rewards/margins": 7.84375, + "rewards/rejected": -5.2890625, + "step": 4441 + }, + { + "epoch": 0.8813929262364204, + "grad_norm": 33.80260541257666, + "learning_rate": 1.3817262030445128e-07, + "logits/chosen": 4.296875, + "logits/rejected": 3.9296875, + "logps/chosen": -1307.0, + "logps/rejected": -893.0, + "loss": 0.3676, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6572265625, + "rewards/margins": 9.6953125, + "rewards/rejected": -7.046875, + "step": 4442 + }, + { + "epoch": 0.8815913487772211, + "grad_norm": 27.06150561617879, + "learning_rate": 1.380470978821468e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 4.03515625, + "logps/chosen": -937.0, + "logps/rejected": -836.0, + "loss": 0.3397, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7578125, + "rewards/margins": 9.4296875, + "rewards/rejected": -6.65234375, + "step": 4443 + }, + { + "epoch": 0.8817897713180217, + "grad_norm": 34.64718600910345, + "learning_rate": 1.3792177306633182e-07, + "logits/chosen": 3.875, + "logits/rejected": 4.20703125, + "logps/chosen": -970.0, + "logps/rejected": -1594.0, + "loss": 0.5193, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.955078125, + "rewards/margins": 8.796875, + "rewards/rejected": -5.8359375, + "step": 4444 + }, + { + "epoch": 0.8819881938588223, + "grad_norm": 27.071451265265324, + "learning_rate": 1.377966459171224e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.98828125, + "logps/chosen": -657.0, + "logps/rejected": -633.0, + "loss": 0.3603, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.455078125, + "rewards/margins": 9.1640625, + "rewards/rejected": -6.70703125, + "step": 4445 + }, + { + "epoch": 0.882186616399623, + "grad_norm": 31.87917148102313, + "learning_rate": 1.3767171649454e-07, + "logits/chosen": 4.5234375, + "logits/rejected": 4.5234375, + "logps/chosen": -1239.0, + "logps/rejected": -1278.0, + "loss": 0.3915, + "rewards/accuracies": 0.78125, + "rewards/chosen": 
3.587890625, + "rewards/margins": 10.5703125, + "rewards/rejected": -6.98828125, + "step": 4446 + }, + { + "epoch": 0.8823850389404236, + "grad_norm": 35.75177995242273, + "learning_rate": 1.3754698485851072e-07, + "logits/chosen": 3.8359375, + "logits/rejected": 3.734375, + "logps/chosen": -1156.0, + "logps/rejected": -851.0, + "loss": 0.3629, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.14453125, + "rewards/margins": 7.9765625, + "rewards/rejected": -4.8515625, + "step": 4447 + }, + { + "epoch": 0.8825834614812242, + "grad_norm": 29.774060536909186, + "learning_rate": 1.3742245106886634e-07, + "logits/chosen": 4.046875, + "logits/rejected": 4.1328125, + "logps/chosen": -880.0, + "logps/rejected": -1510.0, + "loss": 0.3544, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4140625, + "rewards/margins": 12.5546875, + "rewards/rejected": -10.140625, + "step": 4448 + }, + { + "epoch": 0.882781884022025, + "grad_norm": 30.877341854562516, + "learning_rate": 1.372981151853435e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.06640625, + "logps/chosen": -1011.0, + "logps/rejected": -739.5, + "loss": 0.5194, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.107421875, + "rewards/margins": 7.84765625, + "rewards/rejected": -4.72265625, + "step": 4449 + }, + { + "epoch": 0.8829803065628256, + "grad_norm": 35.75504298707355, + "learning_rate": 1.371739772675839e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 4.07421875, + "logps/chosen": -919.0, + "logps/rejected": -834.0, + "loss": 0.4621, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.87890625, + "rewards/margins": 7.8671875, + "rewards/rejected": -5.984375, + "step": 4450 + }, + { + "epoch": 0.8831787291036262, + "grad_norm": 30.13965669610827, + "learning_rate": 1.3705003737513422e-07, + "logits/chosen": 4.37890625, + "logits/rejected": 4.46875, + "logps/chosen": -807.0, + "logps/rejected": -1139.0, + "loss": 0.4364, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.337890625, 
+ "rewards/margins": 13.8828125, + "rewards/rejected": -11.546875, + "step": 4451 + }, + { + "epoch": 0.8833771516444268, + "grad_norm": 34.188842218854575, + "learning_rate": 1.3692629556744617e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.08203125, + "logps/chosen": -933.5, + "logps/rejected": -854.0, + "loss": 0.37, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3203125, + "rewards/margins": 9.046875, + "rewards/rejected": -6.734375, + "step": 4452 + }, + { + "epoch": 0.8835755741852275, + "grad_norm": 36.55453533280813, + "learning_rate": 1.3680275190387675e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 4.3515625, + "logps/chosen": -677.0, + "logps/rejected": -612.0, + "loss": 0.531, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.4423828125, + "rewards/margins": 6.25, + "rewards/rejected": -4.81640625, + "step": 4453 + }, + { + "epoch": 0.8837739967260281, + "grad_norm": 35.04330932627321, + "learning_rate": 1.366794064436874e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 3.89453125, + "logps/chosen": -932.5, + "logps/rejected": -643.0, + "loss": 0.5909, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.05224609375, + "rewards/margins": 5.625, + "rewards/rejected": -3.56640625, + "step": 4454 + }, + { + "epoch": 0.8839724192668287, + "grad_norm": 28.72756808843351, + "learning_rate": 1.3655625924604487e-07, + "logits/chosen": 3.70703125, + "logits/rejected": 4.1015625, + "logps/chosen": -1027.0, + "logps/rejected": -772.0, + "loss": 0.5441, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0810546875, + "rewards/margins": 6.8125, + "rewards/rejected": -5.71875, + "step": 4455 + }, + { + "epoch": 0.8841708418076294, + "grad_norm": 30.047496284475166, + "learning_rate": 1.364333103700207e-07, + "logits/chosen": 4.25, + "logits/rejected": 3.859375, + "logps/chosen": -890.5, + "logps/rejected": -590.5, + "loss": 0.4587, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.28125, + "rewards/margins": 5.9375, + 
"rewards/rejected": -3.6640625, + "step": 4456 + }, + { + "epoch": 0.88436926434843, + "grad_norm": 40.32114667410826, + "learning_rate": 1.3631055987459124e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.6015625, + "logps/chosen": -727.0, + "logps/rejected": -587.0, + "loss": 0.3756, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.30078125, + "rewards/margins": 6.921875, + "rewards/rejected": -4.63671875, + "step": 4457 + }, + { + "epoch": 0.8845676868892306, + "grad_norm": 36.60221756607113, + "learning_rate": 1.3618800781863776e-07, + "logits/chosen": 3.7421875, + "logits/rejected": 3.6015625, + "logps/chosen": -1202.0, + "logps/rejected": -957.0, + "loss": 0.3083, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.3359375, + "rewards/margins": 10.390625, + "rewards/rejected": -7.0625, + "step": 4458 + }, + { + "epoch": 0.8847661094300312, + "grad_norm": 27.61777953079208, + "learning_rate": 1.3606565426094614e-07, + "logits/chosen": 4.26171875, + "logits/rejected": 4.35546875, + "logps/chosen": -1151.0, + "logps/rejected": -1911.0, + "loss": 0.2646, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.859375, + "rewards/margins": 13.3125, + "rewards/rejected": -10.453125, + "step": 4459 + }, + { + "epoch": 0.8849645319708319, + "grad_norm": 38.41639617602137, + "learning_rate": 1.3594349926020743e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.3671875, + "logps/chosen": -824.0, + "logps/rejected": -810.0, + "loss": 0.4268, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7578125, + "rewards/margins": 8.4453125, + "rewards/rejected": -5.677734375, + "step": 4460 + }, + { + "epoch": 0.8851629545116325, + "grad_norm": 36.213114603510206, + "learning_rate": 1.3582154287501716e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.59375, + "logps/chosen": -1075.0, + "logps/rejected": -1671.0, + "loss": 0.4515, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.265625, + "rewards/margins": 11.40625, + "rewards/rejected": 
-9.1171875, + "step": 4461 + }, + { + "epoch": 0.8853613770524331, + "grad_norm": 32.97837062971439, + "learning_rate": 1.356997851638756e-07, + "logits/chosen": 4.07421875, + "logits/rejected": 4.3359375, + "logps/chosen": -922.0, + "logps/rejected": -1562.0, + "loss": 0.2947, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7265625, + "rewards/margins": 11.6875, + "rewards/rejected": -8.96875, + "step": 4462 + }, + { + "epoch": 0.8855597995932338, + "grad_norm": 22.950993569467645, + "learning_rate": 1.3557822618518774e-07, + "logits/chosen": 3.8359375, + "logits/rejected": 3.7109375, + "logps/chosen": -1081.0, + "logps/rejected": -537.5, + "loss": 0.4246, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.091796875, + "rewards/margins": 8.32421875, + "rewards/rejected": -5.22265625, + "step": 4463 + }, + { + "epoch": 0.8857582221340344, + "grad_norm": 24.062237712430782, + "learning_rate": 1.354568659972633e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 3.83984375, + "logps/chosen": -924.0, + "logps/rejected": -796.5, + "loss": 0.3541, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1669921875, + "rewards/margins": 9.5703125, + "rewards/rejected": -7.3828125, + "step": 4464 + }, + { + "epoch": 0.885956644674835, + "grad_norm": 26.26525935271901, + "learning_rate": 1.353357046583165e-07, + "logits/chosen": 3.74609375, + "logits/rejected": 3.51171875, + "logps/chosen": -1114.0, + "logps/rejected": -472.0, + "loss": 0.378, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.22265625, + "rewards/margins": 8.921875, + "rewards/rejected": -5.6953125, + "step": 4465 + }, + { + "epoch": 0.8861550672156357, + "grad_norm": 27.0942807298352, + "learning_rate": 1.3521474222646662e-07, + "logits/chosen": 3.85546875, + "logits/rejected": 3.70703125, + "logps/chosen": -1085.0, + "logps/rejected": -1285.75, + "loss": 0.2935, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.97265625, + "rewards/margins": 11.6953125, + "rewards/rejected": 
-8.68359375, + "step": 4466 + }, + { + "epoch": 0.8863534897564364, + "grad_norm": 25.594316995920753, + "learning_rate": 1.3509397875973673e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.13671875, + "logps/chosen": -1069.0, + "logps/rejected": -932.0, + "loss": 0.3801, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.30859375, + "rewards/margins": 10.15625, + "rewards/rejected": -7.828125, + "step": 4467 + }, + { + "epoch": 0.886551912297237, + "grad_norm": 24.373088996971315, + "learning_rate": 1.3497341431605522e-07, + "logits/chosen": 3.8359375, + "logits/rejected": 3.91015625, + "logps/chosen": -738.0, + "logps/rejected": -570.5, + "loss": 0.5266, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.69189453125, + "rewards/margins": 6.3515625, + "rewards/rejected": -4.65234375, + "step": 4468 + }, + { + "epoch": 0.8867503348380376, + "grad_norm": 28.184570490416842, + "learning_rate": 1.3485304895325462e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.01171875, + "logps/chosen": -965.0, + "logps/rejected": -591.0, + "loss": 0.3852, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.34765625, + "rewards/margins": 8.25, + "rewards/rejected": -4.8984375, + "step": 4469 + }, + { + "epoch": 0.8869487573788383, + "grad_norm": 26.631730592261192, + "learning_rate": 1.34732882729072e-07, + "logits/chosen": 3.58984375, + "logits/rejected": 3.7890625, + "logps/chosen": -1235.0, + "logps/rejected": -1560.0, + "loss": 0.3305, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.248046875, + "rewards/margins": 11.671875, + "rewards/rejected": -9.3984375, + "step": 4470 + }, + { + "epoch": 0.8871471799196389, + "grad_norm": 26.308266312789822, + "learning_rate": 1.3461291570114918e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 4.0703125, + "logps/chosen": -1025.0, + "logps/rejected": -853.0, + "loss": 0.3372, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.83203125, + "rewards/margins": 9.828125, + "rewards/rejected": 
-7.0078125, + "step": 4471 + }, + { + "epoch": 0.8873456024604395, + "grad_norm": 36.70948455325221, + "learning_rate": 1.3449314792703192e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.328125, + "logps/chosen": -1095.0, + "logps/rejected": -1002.0, + "loss": 0.333, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.658203125, + "rewards/margins": 9.359375, + "rewards/rejected": -6.6953125, + "step": 4472 + }, + { + "epoch": 0.8875440250012402, + "grad_norm": 38.11731365940267, + "learning_rate": 1.3437357946417103e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.953125, + "logps/chosen": -811.0, + "logps/rejected": -546.5, + "loss": 0.4623, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.0849609375, + "rewards/margins": 7.5390625, + "rewards/rejected": -5.453125, + "step": 4473 + }, + { + "epoch": 0.8877424475420408, + "grad_norm": 32.673529401315555, + "learning_rate": 1.3425421036992097e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 4.06640625, + "logps/chosen": -933.0, + "logps/rejected": -790.0, + "loss": 0.4195, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.41015625, + "rewards/margins": 7.4296875, + "rewards/rejected": -5.0078125, + "step": 4474 + }, + { + "epoch": 0.8879408700828414, + "grad_norm": 33.65617061146954, + "learning_rate": 1.3413504070154124e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.171875, + "logps/chosen": -929.0, + "logps/rejected": -698.5, + "loss": 0.324, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.990234375, + "rewards/margins": 8.9140625, + "rewards/rejected": -5.92578125, + "step": 4475 + }, + { + "epoch": 0.888139292623642, + "grad_norm": 29.307936100121104, + "learning_rate": 1.3401607051619545e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.29296875, + "logps/chosen": -962.0, + "logps/rejected": -890.0, + "loss": 0.2309, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.44921875, + "rewards/margins": 9.5625, + "rewards/rejected": -6.09375, + 
"step": 4476 + }, + { + "epoch": 0.8883377151644427, + "grad_norm": 33.780342212854606, + "learning_rate": 1.3389729987095133e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 4.16015625, + "logps/chosen": -889.0, + "logps/rejected": -856.0, + "loss": 0.4486, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.927734375, + "rewards/margins": 8.03125, + "rewards/rejected": -6.12109375, + "step": 4477 + }, + { + "epoch": 0.8885361377052433, + "grad_norm": 33.56562109703322, + "learning_rate": 1.337787288227812e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.8671875, + "logps/chosen": -1113.0, + "logps/rejected": -729.0, + "loss": 0.4512, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8515625, + "rewards/margins": 7.52734375, + "rewards/rejected": -4.666015625, + "step": 4478 + }, + { + "epoch": 0.8887345602460439, + "grad_norm": 40.65883689606129, + "learning_rate": 1.3366035742856123e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 4.1953125, + "logps/chosen": -787.0, + "logps/rejected": -826.0, + "loss": 0.3404, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.982177734375, + "rewards/margins": 9.3359375, + "rewards/rejected": -7.359375, + "step": 4479 + }, + { + "epoch": 0.8889329827868446, + "grad_norm": 24.475572545350985, + "learning_rate": 1.335421857450724e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 4.01171875, + "logps/chosen": -850.5, + "logps/rejected": -2050.5, + "loss": 0.4763, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.03662109375, + "rewards/margins": 10.1953125, + "rewards/rejected": -8.15234375, + "step": 4480 + }, + { + "epoch": 0.8891314053276452, + "grad_norm": 37.43653846316377, + "learning_rate": 1.3342421382899935e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.16015625, + "logps/chosen": -948.5, + "logps/rejected": -917.5, + "loss": 0.4317, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.48046875, + "rewards/margins": 5.8359375, + "rewards/rejected": -4.35546875, + 
"step": 4481 + }, + { + "epoch": 0.8893298278684458, + "grad_norm": 29.83787540462214, + "learning_rate": 1.333064417369311e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.2578125, + "logps/chosen": -1432.0, + "logps/rejected": -811.0, + "loss": 0.3587, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.43359375, + "rewards/margins": 8.375, + "rewards/rejected": -4.9375, + "step": 4482 + }, + { + "epoch": 0.8895282504092465, + "grad_norm": 33.34906265939804, + "learning_rate": 1.3318886952536114e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 3.76953125, + "logps/chosen": -1352.0, + "logps/rejected": -1658.0, + "loss": 0.2361, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.515625, + "rewards/margins": 12.0078125, + "rewards/rejected": -8.515625, + "step": 4483 + }, + { + "epoch": 0.8897266729500471, + "grad_norm": 38.23681995312532, + "learning_rate": 1.330714972506863e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 3.8515625, + "logps/chosen": -781.0, + "logps/rejected": -536.5, + "loss": 0.4182, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.875030517578125, + "rewards/margins": 6.24609375, + "rewards/rejected": -4.37109375, + "step": 4484 + }, + { + "epoch": 0.8899250954908478, + "grad_norm": 35.748661460647554, + "learning_rate": 1.3295432496920832e-07, + "logits/chosen": 3.73828125, + "logits/rejected": 3.50390625, + "logps/chosen": -1319.0, + "logps/rejected": -942.0, + "loss": 0.3164, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.578125, + "rewards/margins": 8.15625, + "rewards/rejected": -5.578125, + "step": 4485 + }, + { + "epoch": 0.8901235180316484, + "grad_norm": 35.102301766311435, + "learning_rate": 1.3283735273713253e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.921875, + "logps/chosen": -1139.5, + "logps/rejected": -1575.5, + "loss": 0.3579, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.447265625, + "rewards/margins": 10.3984375, + "rewards/rejected": -7.96484375, + "step": 4486 
+ }, + { + "epoch": 0.8903219405724491, + "grad_norm": 31.865219566313428, + "learning_rate": 1.327205806105685e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.23828125, + "logps/chosen": -1235.0, + "logps/rejected": -760.0, + "loss": 0.4149, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0859375, + "rewards/margins": 9.09375, + "rewards/rejected": -5.9921875, + "step": 4487 + }, + { + "epoch": 0.8905203631132497, + "grad_norm": 27.362324412311924, + "learning_rate": 1.3260400864552968e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.1640625, + "logps/chosen": -950.0, + "logps/rejected": -843.0, + "loss": 0.4648, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.61328125, + "rewards/margins": 7.9609375, + "rewards/rejected": -5.3515625, + "step": 4488 + }, + { + "epoch": 0.8907187856540503, + "grad_norm": 24.989611911653093, + "learning_rate": 1.324876368979336e-07, + "logits/chosen": 3.67578125, + "logits/rejected": 3.56640625, + "logps/chosen": -1155.0, + "logps/rejected": -830.0, + "loss": 0.337, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.044921875, + "rewards/margins": 8.421875, + "rewards/rejected": -5.3828125, + "step": 4489 + }, + { + "epoch": 0.890917208194851, + "grad_norm": 34.61508085774431, + "learning_rate": 1.323714654236016e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 4.078125, + "logps/chosen": -1036.0, + "logps/rejected": -800.0, + "loss": 0.4009, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0703125, + "rewards/margins": 7.8359375, + "rewards/rejected": -5.77734375, + "step": 4490 + }, + { + "epoch": 0.8911156307356516, + "grad_norm": 30.35404332660595, + "learning_rate": 1.322554942782593e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.09375, + "logps/chosen": -1055.5, + "logps/rejected": -675.5, + "loss": 0.3757, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.458984375, + "rewards/margins": 8.4140625, + "rewards/rejected": -4.94921875, + "step": 4491 + }, + { + "epoch": 
0.8913140532764522, + "grad_norm": 35.94807124015238, + "learning_rate": 1.321397235175359e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.4921875, + "logps/chosen": -1231.0, + "logps/rejected": -1057.0, + "loss": 0.4543, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.5224609375, + "rewards/margins": 8.203125, + "rewards/rejected": -5.6796875, + "step": 4492 + }, + { + "epoch": 0.8915124758172528, + "grad_norm": 31.573273748182014, + "learning_rate": 1.3202415319696453e-07, + "logits/chosen": 4.484375, + "logits/rejected": 3.97265625, + "logps/chosen": -694.0, + "logps/rejected": -495.5, + "loss": 0.3504, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5732421875, + "rewards/margins": 7.0625, + "rewards/rejected": -4.5, + "step": 4493 + }, + { + "epoch": 0.8917108983580535, + "grad_norm": 42.05748181986973, + "learning_rate": 1.319087833719823e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 4.3828125, + "logps/chosen": -846.5, + "logps/rejected": -1383.0, + "loss": 0.4119, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.326171875, + "rewards/margins": 11.1953125, + "rewards/rejected": -8.890625, + "step": 4494 + }, + { + "epoch": 0.8919093208988541, + "grad_norm": 26.526196946169982, + "learning_rate": 1.3179361409792984e-07, + "logits/chosen": 3.85546875, + "logits/rejected": 3.8671875, + "logps/chosen": -987.0, + "logps/rejected": -623.0, + "loss": 0.342, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4599609375, + "rewards/margins": 9.03125, + "rewards/rejected": -6.578125, + "step": 4495 + }, + { + "epoch": 0.8921077434396547, + "grad_norm": 27.16439168360375, + "learning_rate": 1.3167864543005219e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.4296875, + "logps/chosen": -862.0, + "logps/rejected": -553.0, + "loss": 0.3534, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.76953125, + "rewards/margins": 8.015625, + "rewards/rejected": -5.234375, + "step": 4496 + }, + { + "epoch": 0.8923061659804554, + 
"grad_norm": 23.88013684137119, + "learning_rate": 1.3156387742349745e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.171875, + "logps/chosen": -976.0, + "logps/rejected": -845.0, + "loss": 0.3094, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.28515625, + "rewards/margins": 11.109375, + "rewards/rejected": -7.84375, + "step": 4497 + }, + { + "epoch": 0.892504588521256, + "grad_norm": 42.42758361283756, + "learning_rate": 1.3144931013331797e-07, + "logits/chosen": 4.6015625, + "logits/rejected": 4.4921875, + "logps/chosen": -963.0, + "logps/rejected": -587.5, + "loss": 0.4626, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.71875, + "rewards/margins": 6.4375, + "rewards/rejected": -4.71875, + "step": 4498 + }, + { + "epoch": 0.8927030110620566, + "grad_norm": 27.245762097270305, + "learning_rate": 1.313349436144696e-07, + "logits/chosen": 4.53125, + "logits/rejected": 4.26953125, + "logps/chosen": -628.0, + "logps/rejected": -480.0, + "loss": 0.4351, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.5234375, + "rewards/margins": 6.390625, + "rewards/rejected": -4.875, + "step": 4499 + }, + { + "epoch": 0.8929014336028572, + "grad_norm": 45.80861620774116, + "learning_rate": 1.3122077792181192e-07, + "logits/chosen": 3.50390625, + "logits/rejected": 3.2734375, + "logps/chosen": -907.0, + "logps/rejected": -617.5, + "loss": 0.4333, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.830078125, + "rewards/margins": 7.25, + "rewards/rejected": -5.4375, + "step": 4500 + }, + { + "epoch": 0.8930998561436579, + "grad_norm": 31.69557606790446, + "learning_rate": 1.3110681311010814e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.15234375, + "logps/chosen": -938.0, + "logps/rejected": -1608.0, + "loss": 0.527, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.06494140625, + "rewards/margins": 8.15625, + "rewards/rejected": -6.0859375, + "step": 4501 + }, + { + "epoch": 0.8932982786844585, + "grad_norm": 40.63142277321202, + 
"learning_rate": 1.309930492340251e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 3.453125, + "logps/chosen": -742.0, + "logps/rejected": -868.0, + "loss": 0.4236, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.822265625, + "rewards/margins": 7.0625, + "rewards/rejected": -5.234375, + "step": 4502 + }, + { + "epoch": 0.8934967012252591, + "grad_norm": 29.203285113233076, + "learning_rate": 1.3087948634813362e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 3.92578125, + "logps/chosen": -1270.0, + "logps/rejected": -949.0, + "loss": 0.3461, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.3251953125, + "rewards/margins": 8.8125, + "rewards/rejected": -6.46875, + "step": 4503 + }, + { + "epoch": 0.8936951237660599, + "grad_norm": 38.70171241897112, + "learning_rate": 1.3076612450690742e-07, + "logits/chosen": 3.5390625, + "logits/rejected": 3.55859375, + "logps/chosen": -634.0, + "logps/rejected": -648.5, + "loss": 0.4612, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.45550537109375, + "rewards/margins": 6.3828125, + "rewards/rejected": -4.92578125, + "step": 4504 + }, + { + "epoch": 0.8938935463068605, + "grad_norm": 18.44209946323619, + "learning_rate": 1.306529637647244e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.1796875, + "logps/chosen": -1062.0, + "logps/rejected": -648.5, + "loss": 0.3492, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.197265625, + "rewards/margins": 8.328125, + "rewards/rejected": -5.125, + "step": 4505 + }, + { + "epoch": 0.8940919688476611, + "grad_norm": 28.859649679004026, + "learning_rate": 1.3054000417586568e-07, + "logits/chosen": 4.359375, + "logits/rejected": 4.0625, + "logps/chosen": -984.0, + "logps/rejected": -802.5, + "loss": 0.5023, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6708984375, + "rewards/margins": 7.359375, + "rewards/rejected": -4.6875, + "step": 4506 + }, + { + "epoch": 0.8942903913884618, + "grad_norm": 30.54017031972005, + "learning_rate": 
1.304272457945159e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.515625, + "logps/chosen": -809.0, + "logps/rejected": -713.5, + "loss": 0.4442, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2353515625, + "rewards/margins": 7.7109375, + "rewards/rejected": -5.484375, + "step": 4507 + }, + { + "epoch": 0.8944888139292624, + "grad_norm": 38.64837856846549, + "learning_rate": 1.3031468867476348e-07, + "logits/chosen": 3.79296875, + "logits/rejected": 3.73828125, + "logps/chosen": -1240.0, + "logps/rejected": -792.0, + "loss": 0.433, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.462890625, + "rewards/margins": 7.546875, + "rewards/rejected": -5.078125, + "step": 4508 + }, + { + "epoch": 0.894687236470063, + "grad_norm": 25.79464209287623, + "learning_rate": 1.3020233287059976e-07, + "logits/chosen": 4.29296875, + "logits/rejected": 3.82421875, + "logps/chosen": -1013.0, + "logps/rejected": -741.0, + "loss": 0.4376, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.345703125, + "rewards/margins": 7.1796875, + "rewards/rejected": -4.82421875, + "step": 4509 + }, + { + "epoch": 0.8948856590108636, + "grad_norm": 33.900798176812785, + "learning_rate": 1.3009017843592006e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.76953125, + "logps/chosen": -1097.0, + "logps/rejected": -701.0, + "loss": 0.3768, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.234375, + "rewards/margins": 8.1796875, + "rewards/rejected": -5.94140625, + "step": 4510 + }, + { + "epoch": 0.8950840815516643, + "grad_norm": 43.98967602321872, + "learning_rate": 1.299782254245228e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.09375, + "logps/chosen": -929.0, + "logps/rejected": -851.0, + "loss": 0.4251, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.373046875, + "rewards/margins": 6.7421875, + "rewards/rejected": -4.36328125, + "step": 4511 + }, + { + "epoch": 0.8952825040924649, + "grad_norm": 33.0319097902546, + "learning_rate": 
1.2986647389010982e-07, + "logits/chosen": 3.875, + "logits/rejected": 4.2890625, + "logps/chosen": -1095.0, + "logps/rejected": -912.0, + "loss": 0.4384, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.921875, + "rewards/margins": 8.953125, + "rewards/rejected": -6.03515625, + "step": 4512 + }, + { + "epoch": 0.8954809266332655, + "grad_norm": 24.61901020628586, + "learning_rate": 1.2975492388628638e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.07421875, + "logps/chosen": -853.0, + "logps/rejected": -813.0, + "loss": 0.3189, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8232421875, + "rewards/margins": 7.6875, + "rewards/rejected": -4.859375, + "step": 4513 + }, + { + "epoch": 0.8956793491740662, + "grad_norm": 37.70192217004799, + "learning_rate": 1.2964357546656095e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.796875, + "logps/chosen": -1013.0, + "logps/rejected": -1020.0, + "loss": 0.4216, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.171875, + "rewards/margins": 8.1796875, + "rewards/rejected": -6.0078125, + "step": 4514 + }, + { + "epoch": 0.8958777717148668, + "grad_norm": 33.504697512655824, + "learning_rate": 1.2953242868434554e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.0625, + "logps/chosen": -614.5, + "logps/rejected": -553.5, + "loss": 0.5207, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.33935546875, + "rewards/margins": 6.2578125, + "rewards/rejected": -4.923828125, + "step": 4515 + }, + { + "epoch": 0.8960761942556674, + "grad_norm": 35.379592287740806, + "learning_rate": 1.2942148359295523e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 4.0, + "logps/chosen": -1301.0, + "logps/rejected": -817.0, + "loss": 0.3524, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.330078125, + "rewards/margins": 7.78515625, + "rewards/rejected": -5.47265625, + "step": 4516 + }, + { + "epoch": 0.896274616796468, + "grad_norm": 28.816492888422022, + "learning_rate": 
1.2931074024560834e-07, + "logits/chosen": 4.5546875, + "logits/rejected": 4.33984375, + "logps/chosen": -1345.0, + "logps/rejected": -826.0, + "loss": 0.3857, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.849609375, + "rewards/margins": 9.6796875, + "rewards/rejected": -6.83203125, + "step": 4517 + }, + { + "epoch": 0.8964730393372687, + "grad_norm": 30.177728663128487, + "learning_rate": 1.2920019869542657e-07, + "logits/chosen": 3.76953125, + "logits/rejected": 4.0859375, + "logps/chosen": -1038.0, + "logps/rejected": -691.0, + "loss": 0.4812, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.689453125, + "rewards/margins": 7.703125, + "rewards/rejected": -6.0078125, + "step": 4518 + }, + { + "epoch": 0.8966714618780693, + "grad_norm": 31.646755783731756, + "learning_rate": 1.290898589954347e-07, + "logits/chosen": 4.046875, + "logits/rejected": 3.83984375, + "logps/chosen": -1015.0, + "logps/rejected": -589.5, + "loss": 0.385, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.013671875, + "rewards/margins": 8.484375, + "rewards/rejected": -5.484375, + "step": 4519 + }, + { + "epoch": 0.89686988441887, + "grad_norm": 40.69888766516743, + "learning_rate": 1.289797211985607e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.4765625, + "logps/chosen": -842.0, + "logps/rejected": -1481.0, + "loss": 0.4823, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.615234375, + "rewards/margins": 10.015625, + "rewards/rejected": -7.39453125, + "step": 4520 + }, + { + "epoch": 0.8970683069596707, + "grad_norm": 33.60111993626845, + "learning_rate": 1.2886978535763576e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.6953125, + "logps/chosen": -697.5, + "logps/rejected": -1515.0, + "loss": 0.539, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.3193359375, + "rewards/margins": 7.5703125, + "rewards/rejected": -6.26171875, + "step": 4521 + }, + { + "epoch": 0.8972667295004713, + "grad_norm": 31.549133434911983, + "learning_rate": 
1.2876005152539407e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 4.00390625, + "logps/chosen": -1042.0, + "logps/rejected": -634.0, + "loss": 0.4648, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.580078125, + "rewards/margins": 7.1875, + "rewards/rejected": -5.609375, + "step": 4522 + }, + { + "epoch": 0.8974651520412719, + "grad_norm": 29.92547457342646, + "learning_rate": 1.286505197544731e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.59765625, + "logps/chosen": -1077.0, + "logps/rejected": -1268.5, + "loss": 0.4166, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5078125, + "rewards/margins": 10.125, + "rewards/rejected": -7.6328125, + "step": 4523 + }, + { + "epoch": 0.8976635745820726, + "grad_norm": 40.75389499003662, + "learning_rate": 1.285411900974133e-07, + "logits/chosen": 3.8046875, + "logits/rejected": 3.7734375, + "logps/chosen": -906.0, + "logps/rejected": -666.5, + "loss": 0.4662, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8212890625, + "rewards/margins": 7.125, + "rewards/rejected": -5.30078125, + "step": 4524 + }, + { + "epoch": 0.8978619971228732, + "grad_norm": 18.767533553727475, + "learning_rate": 1.284320626066581e-07, + "logits/chosen": 3.76953125, + "logits/rejected": 3.625, + "logps/chosen": -959.0, + "logps/rejected": -637.5, + "loss": 0.243, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1875, + "rewards/margins": 9.7734375, + "rewards/rejected": -6.5859375, + "step": 4525 + }, + { + "epoch": 0.8980604196636738, + "grad_norm": 37.15177434996455, + "learning_rate": 1.2832313733455408e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.7578125, + "logps/chosen": -1055.0, + "logps/rejected": -751.0, + "loss": 0.4058, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.95263671875, + "rewards/margins": 7.53125, + "rewards/rejected": -5.578125, + "step": 4526 + }, + { + "epoch": 0.8982588422044744, + "grad_norm": 36.64685928287343, + "learning_rate": 1.2821441433335062e-07, + 
"logits/chosen": 3.93359375, + "logits/rejected": 4.08203125, + "logps/chosen": -817.5, + "logps/rejected": -667.5, + "loss": 0.3406, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.625, + "rewards/margins": 8.6171875, + "rewards/rejected": -5.9765625, + "step": 4527 + }, + { + "epoch": 0.8984572647452751, + "grad_norm": 34.45895625180723, + "learning_rate": 1.2810589365520042e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.7890625, + "logps/chosen": -1034.0, + "logps/rejected": -700.0, + "loss": 0.3786, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.59765625, + "rewards/margins": 8.5234375, + "rewards/rejected": -5.921875, + "step": 4528 + }, + { + "epoch": 0.8986556872860757, + "grad_norm": 31.511139758473497, + "learning_rate": 1.2799757535215869e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.8984375, + "logps/chosen": -1135.0, + "logps/rejected": -1119.0, + "loss": 0.4225, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.263671875, + "rewards/margins": 9.2109375, + "rewards/rejected": -6.96875, + "step": 4529 + }, + { + "epoch": 0.8988541098268763, + "grad_norm": 32.33501450088089, + "learning_rate": 1.2788945947618401e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.12890625, + "logps/chosen": -975.0, + "logps/rejected": -770.0, + "loss": 0.4797, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.783203125, + "rewards/margins": 7.5078125, + "rewards/rejected": -4.73828125, + "step": 4530 + }, + { + "epoch": 0.899052532367677, + "grad_norm": 27.16786622476645, + "learning_rate": 1.2778154607913754e-07, + "logits/chosen": 4.015625, + "logits/rejected": 3.87109375, + "logps/chosen": -553.5, + "logps/rejected": -517.5, + "loss": 0.3015, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.37109375, + "rewards/margins": 9.34375, + "rewards/rejected": -6.9609375, + "step": 4531 + }, + { + "epoch": 0.8992509549084776, + "grad_norm": 27.614261774571165, + "learning_rate": 1.2767383521278338e-07, + 
"logits/chosen": 3.7578125, + "logits/rejected": 3.6875, + "logps/chosen": -846.0, + "logps/rejected": -1029.0, + "loss": 0.5429, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5029296875, + "rewards/margins": 11.6044921875, + "rewards/rejected": -9.1162109375, + "step": 4532 + }, + { + "epoch": 0.8994493774492782, + "grad_norm": 32.02168086006903, + "learning_rate": 1.2756632692878868e-07, + "logits/chosen": 3.515625, + "logits/rejected": 3.70703125, + "logps/chosen": -801.0, + "logps/rejected": -770.0, + "loss": 0.4782, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.125, + "rewards/margins": 7.4140625, + "rewards/rejected": -5.2890625, + "step": 4533 + }, + { + "epoch": 0.8996477999900788, + "grad_norm": 31.525241063103078, + "learning_rate": 1.2745902127872302e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.19140625, + "logps/chosen": -1250.0, + "logps/rejected": -935.0, + "loss": 0.3168, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.875, + "rewards/margins": 10.796875, + "rewards/rejected": -7.90625, + "step": 4534 + }, + { + "epoch": 0.8998462225308795, + "grad_norm": 27.91974458954146, + "learning_rate": 1.2735191831405923e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.0390625, + "logps/chosen": -961.0, + "logps/rejected": -795.0, + "loss": 0.2541, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.390625, + "rewards/margins": 10.109375, + "rewards/rejected": -6.71875, + "step": 4535 + }, + { + "epoch": 0.9000446450716801, + "grad_norm": 28.430373221849933, + "learning_rate": 1.272450180861726e-07, + "logits/chosen": 4.22265625, + "logits/rejected": 4.16796875, + "logps/chosen": -1189.0, + "logps/rejected": -783.0, + "loss": 0.2791, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.05859375, + "rewards/margins": 9.5625, + "rewards/rejected": -6.484375, + "step": 4536 + }, + { + "epoch": 0.9002430676124807, + "grad_norm": 32.597870072350496, + "learning_rate": 1.2713832064634125e-07, + "logits/chosen": 
4.21875, + "logits/rejected": 4.203125, + "logps/chosen": -906.0, + "logps/rejected": -767.5, + "loss": 0.3646, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.560546875, + "rewards/margins": 9.69140625, + "rewards/rejected": -7.1123046875, + "step": 4537 + }, + { + "epoch": 0.9004414901532815, + "grad_norm": 26.977569737006608, + "learning_rate": 1.2703182604574608e-07, + "logits/chosen": 3.86328125, + "logits/rejected": 3.9296875, + "logps/chosen": -1089.0, + "logps/rejected": -816.0, + "loss": 0.3525, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.89453125, + "rewards/margins": 8.8125, + "rewards/rejected": -5.921875, + "step": 4538 + }, + { + "epoch": 0.9006399126940821, + "grad_norm": 43.44436922319208, + "learning_rate": 1.2692553433547064e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 3.8203125, + "logps/chosen": -1064.0, + "logps/rejected": -754.5, + "loss": 0.4725, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.37109375, + "rewards/margins": 6.7578125, + "rewards/rejected": -4.38671875, + "step": 4539 + }, + { + "epoch": 0.9008383352348827, + "grad_norm": 26.806878683668145, + "learning_rate": 1.268194455665013e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 4.1796875, + "logps/chosen": -1093.0, + "logps/rejected": -806.0, + "loss": 0.2605, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.25390625, + "rewards/margins": 10.703125, + "rewards/rejected": -7.4453125, + "step": 4540 + }, + { + "epoch": 0.9010367577756834, + "grad_norm": 47.784820964145986, + "learning_rate": 1.2671355978972672e-07, + "logits/chosen": 4.21484375, + "logits/rejected": 4.3125, + "logps/chosen": -899.0, + "logps/rejected": -808.0, + "loss": 0.3416, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.177734375, + "rewards/margins": 9.15625, + "rewards/rejected": -5.98046875, + "step": 4541 + }, + { + "epoch": 0.901235180316484, + "grad_norm": 31.77447392889085, + "learning_rate": 1.266078770559386e-07, + "logits/chosen": 4.23046875, + 
"logits/rejected": 4.21875, + "logps/chosen": -1302.0, + "logps/rejected": -856.0, + "loss": 0.3242, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.6875, + "rewards/margins": 9.3359375, + "rewards/rejected": -5.634765625, + "step": 4542 + }, + { + "epoch": 0.9014336028572846, + "grad_norm": 25.84092126286967, + "learning_rate": 1.2650239741583104e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 3.94921875, + "logps/chosen": -840.0, + "logps/rejected": -590.0, + "loss": 0.418, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.65625, + "rewards/margins": 7.03125, + "rewards/rejected": -4.3828125, + "step": 4543 + }, + { + "epoch": 0.9016320253980852, + "grad_norm": 32.15229747206022, + "learning_rate": 1.2639712092000073e-07, + "logits/chosen": 3.7109375, + "logits/rejected": 3.83984375, + "logps/chosen": -1120.0, + "logps/rejected": -1661.0, + "loss": 0.3912, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.04296875, + "rewards/margins": 10.9609375, + "rewards/rejected": -7.9140625, + "step": 4544 + }, + { + "epoch": 0.9018304479388859, + "grad_norm": 30.12389963406728, + "learning_rate": 1.2629204761894697e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.23046875, + "logps/chosen": -1067.0, + "logps/rejected": -761.5, + "loss": 0.3452, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9609375, + "rewards/margins": 8.5390625, + "rewards/rejected": -5.5625, + "step": 4545 + }, + { + "epoch": 0.9020288704796865, + "grad_norm": 27.4923907353277, + "learning_rate": 1.2618717756307142e-07, + "logits/chosen": 4.359375, + "logits/rejected": 4.3046875, + "logps/chosen": -767.0, + "logps/rejected": -588.5, + "loss": 0.4367, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.78515625, + "rewards/margins": 6.84375, + "rewards/rejected": -5.06640625, + "step": 4546 + }, + { + "epoch": 0.9022272930204871, + "grad_norm": 28.34800078087179, + "learning_rate": 1.2608251080267862e-07, + "logits/chosen": 4.15625, + "logits/rejected": 
4.43359375, + "logps/chosen": -563.5, + "logps/rejected": -589.5, + "loss": 0.3839, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.076171875, + "rewards/margins": 6.1953125, + "rewards/rejected": -4.11328125, + "step": 4547 + }, + { + "epoch": 0.9024257155612878, + "grad_norm": 25.16166604212153, + "learning_rate": 1.2597804738797516e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 3.875, + "logps/chosen": -1192.0, + "logps/rejected": -766.0, + "loss": 0.2945, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.017578125, + "rewards/margins": 8.9375, + "rewards/rejected": -5.9140625, + "step": 4548 + }, + { + "epoch": 0.9026241381020884, + "grad_norm": 24.71054176682531, + "learning_rate": 1.2587378736907033e-07, + "logits/chosen": 3.87109375, + "logits/rejected": 3.953125, + "logps/chosen": -846.0, + "logps/rejected": -626.0, + "loss": 0.4634, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.345703125, + "rewards/margins": 6.53125, + "rewards/rejected": -4.18359375, + "step": 4549 + }, + { + "epoch": 0.902822560642889, + "grad_norm": 29.871485473560153, + "learning_rate": 1.2576973079597589e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.9296875, + "logps/chosen": -1110.0, + "logps/rejected": -1151.0, + "loss": 0.3572, + "rewards/accuracies": 0.90625, + "rewards/chosen": 4.1328125, + "rewards/margins": 11.4921875, + "rewards/rejected": -7.375, + "step": 4550 + }, + { + "epoch": 0.9030209831836896, + "grad_norm": 34.143385570875814, + "learning_rate": 1.2566587771860588e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 4.12890625, + "logps/chosen": -1013.0, + "logps/rejected": -838.0, + "loss": 0.3787, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.248046875, + "rewards/margins": 9.6171875, + "rewards/rejected": -7.3671875, + "step": 4551 + }, + { + "epoch": 0.9032194057244903, + "grad_norm": 27.686079451035617, + "learning_rate": 1.2556222818677662e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 4.03125, + 
"logps/chosen": -1217.0, + "logps/rejected": -822.0, + "loss": 0.3027, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.5, + "rewards/margins": 10.484375, + "rewards/rejected": -6.96875, + "step": 4552 + }, + { + "epoch": 0.9034178282652909, + "grad_norm": 33.587810922221706, + "learning_rate": 1.2545878225020727e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 3.87890625, + "logps/chosen": -1023.0, + "logps/rejected": -709.0, + "loss": 0.3787, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.376953125, + "rewards/margins": 8.6875, + "rewards/rejected": -6.328125, + "step": 4553 + }, + { + "epoch": 0.9036162508060915, + "grad_norm": 27.9902362102534, + "learning_rate": 1.2535553995851868e-07, + "logits/chosen": 3.83203125, + "logits/rejected": 3.66015625, + "logps/chosen": -767.0, + "logps/rejected": -749.0, + "loss": 0.4088, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.9140625, + "rewards/margins": 8.109375, + "rewards/rejected": -6.203125, + "step": 4554 + }, + { + "epoch": 0.9038146733468923, + "grad_norm": 25.05065938696203, + "learning_rate": 1.2525250136123457e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.1953125, + "logps/chosen": -813.0, + "logps/rejected": -1178.0, + "loss": 0.2691, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.984375, + "rewards/margins": 11.40625, + "rewards/rejected": -8.421875, + "step": 4555 + }, + { + "epoch": 0.9040130958876929, + "grad_norm": 30.879931697772946, + "learning_rate": 1.251496665077807e-07, + "logits/chosen": 4.015625, + "logits/rejected": 4.04296875, + "logps/chosen": -1005.0, + "logps/rejected": -1273.5, + "loss": 0.3656, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.640625, + "rewards/margins": 9.9296875, + "rewards/rejected": -7.30859375, + "step": 4556 + }, + { + "epoch": 0.9042115184284935, + "grad_norm": 25.8359299995658, + "learning_rate": 1.2504703544748495e-07, + "logits/chosen": 3.96875, + "logits/rejected": 3.70703125, + "logps/chosen": -732.5, + 
"logps/rejected": -565.0, + "loss": 0.4661, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.259765625, + "rewards/margins": 6.44140625, + "rewards/rejected": -4.1748046875, + "step": 4557 + }, + { + "epoch": 0.9044099409692941, + "grad_norm": 25.860819870284118, + "learning_rate": 1.249446082295779e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 4.1640625, + "logps/chosen": -765.5, + "logps/rejected": -557.0, + "loss": 0.3336, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.49609375, + "rewards/margins": 8.6171875, + "rewards/rejected": -5.1328125, + "step": 4558 + }, + { + "epoch": 0.9046083635100948, + "grad_norm": 28.374032879845096, + "learning_rate": 1.248423849031918e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.09765625, + "logps/chosen": -1175.0, + "logps/rejected": -835.0, + "loss": 0.3616, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.4140625, + "rewards/margins": 7.8984375, + "rewards/rejected": -4.46484375, + "step": 4559 + }, + { + "epoch": 0.9048067860508954, + "grad_norm": 30.43949343904463, + "learning_rate": 1.247403655173615e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.1796875, + "logps/chosen": -1225.0, + "logps/rejected": -1008.5, + "loss": 0.2983, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1357421875, + "rewards/margins": 8.8828125, + "rewards/rejected": -6.76953125, + "step": 4560 + }, + { + "epoch": 0.905005208591696, + "grad_norm": 30.602080536174483, + "learning_rate": 1.2463855012102382e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 4.265625, + "logps/chosen": -1039.0, + "logps/rejected": -804.5, + "loss": 0.4638, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.607421875, + "rewards/margins": 7.578125, + "rewards/rejected": -4.9765625, + "step": 4561 + }, + { + "epoch": 0.9052036311324967, + "grad_norm": 34.94613797950033, + "learning_rate": 1.2453693876301783e-07, + "logits/chosen": 3.390625, + "logits/rejected": 3.48046875, + "logps/chosen": -967.0, + 
"logps/rejected": -816.0, + "loss": 0.378, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.4375, + "rewards/margins": 9.4140625, + "rewards/rejected": -5.96875, + "step": 4562 + }, + { + "epoch": 0.9054020536732973, + "grad_norm": 29.52348416609881, + "learning_rate": 1.244355314920848e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.0703125, + "logps/chosen": -1017.0, + "logps/rejected": -679.0, + "loss": 0.4376, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.1806640625, + "rewards/margins": 6.9921875, + "rewards/rejected": -4.8125, + "step": 4563 + }, + { + "epoch": 0.9056004762140979, + "grad_norm": 27.764653100888182, + "learning_rate": 1.243343283568678e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.7109375, + "logps/chosen": -910.0, + "logps/rejected": -539.0, + "loss": 0.4276, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.927734375, + "rewards/margins": 6.4765625, + "rewards/rejected": -4.56640625, + "step": 4564 + }, + { + "epoch": 0.9057988987548986, + "grad_norm": 33.20875166113347, + "learning_rate": 1.2423332940591238e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.7109375, + "logps/chosen": -949.0, + "logps/rejected": -597.5, + "loss": 0.4441, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.3756103515625, + "rewards/margins": 5.953125, + "rewards/rejected": -4.5625, + "step": 4565 + }, + { + "epoch": 0.9059973212956992, + "grad_norm": 33.45146403837966, + "learning_rate": 1.2413253468766565e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.12109375, + "logps/chosen": -789.0, + "logps/rejected": -592.5, + "loss": 0.5242, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.01953125, + "rewards/margins": 4.650390625, + "rewards/rejected": -2.6298828125, + "step": 4566 + }, + { + "epoch": 0.9061957438364998, + "grad_norm": 28.43067723932987, + "learning_rate": 1.2403194425047735e-07, + "logits/chosen": 4.734375, + "logits/rejected": 4.46875, + "logps/chosen": -1029.0, + "logps/rejected": 
-657.5, + "loss": 0.4911, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8931884765625, + "rewards/margins": 6.328125, + "rewards/rejected": -4.4296875, + "step": 4567 + }, + { + "epoch": 0.9063941663773004, + "grad_norm": 36.136645563178796, + "learning_rate": 1.2393155814259883e-07, + "logits/chosen": 3.6328125, + "logits/rejected": 3.81640625, + "logps/chosen": -986.0, + "logps/rejected": -574.5, + "loss": 0.4659, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.6328125, + "rewards/margins": 6.3671875, + "rewards/rejected": -4.73046875, + "step": 4568 + }, + { + "epoch": 0.9065925889181011, + "grad_norm": 29.468965502731283, + "learning_rate": 1.2383137641218347e-07, + "logits/chosen": 3.5234375, + "logits/rejected": 3.32421875, + "logps/chosen": -1105.0, + "logps/rejected": -866.0, + "loss": 0.3387, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.64453125, + "rewards/margins": 8.9453125, + "rewards/rejected": -6.3125, + "step": 4569 + }, + { + "epoch": 0.9067910114589017, + "grad_norm": 29.135261722012533, + "learning_rate": 1.2373139910728668e-07, + "logits/chosen": 3.56640625, + "logits/rejected": 3.625, + "logps/chosen": -1073.0, + "logps/rejected": -1054.0, + "loss": 0.3432, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7421875, + "rewards/margins": 10.7265625, + "rewards/rejected": -8.0, + "step": 4570 + }, + { + "epoch": 0.9069894339997023, + "grad_norm": 38.39866455605511, + "learning_rate": 1.2363162627586574e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.0546875, + "logps/chosen": -1202.0, + "logps/rejected": -769.5, + "loss": 0.478, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.6845703125, + "rewards/margins": 6.6171875, + "rewards/rejected": -4.91796875, + "step": 4571 + }, + { + "epoch": 0.907187856540503, + "grad_norm": 30.520515066392825, + "learning_rate": 1.2353205796578005e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 4.109375, + "logps/chosen": -994.5, + "logps/rejected": -743.0, + 
"loss": 0.3738, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.638671875, + "rewards/margins": 7.7265625, + "rewards/rejected": -5.09375, + "step": 4572 + }, + { + "epoch": 0.9073862790813036, + "grad_norm": 31.024060774921402, + "learning_rate": 1.2343269422479073e-07, + "logits/chosen": 3.5078125, + "logits/rejected": 3.6328125, + "logps/chosen": -997.5, + "logps/rejected": -1390.0, + "loss": 0.4903, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.1572265625, + "rewards/margins": 10.8046875, + "rewards/rejected": -8.640625, + "step": 4573 + }, + { + "epoch": 0.9075847016221043, + "grad_norm": 39.06682812361857, + "learning_rate": 1.233335351005607e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 3.77734375, + "logps/chosen": -999.0, + "logps/rejected": -872.0, + "loss": 0.4451, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.521484375, + "rewards/margins": 6.875, + "rewards/rejected": -4.37109375, + "step": 4574 + }, + { + "epoch": 0.9077831241629049, + "grad_norm": 33.90458613892482, + "learning_rate": 1.2323458064065495e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 3.83203125, + "logps/chosen": -1038.0, + "logps/rejected": -708.5, + "loss": 0.3241, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.755859375, + "rewards/margins": 9.140625, + "rewards/rejected": -6.390625, + "step": 4575 + }, + { + "epoch": 0.9079815467037056, + "grad_norm": 51.65939708555796, + "learning_rate": 1.2313583089254005e-07, + "logits/chosen": 3.7734375, + "logits/rejected": 3.87890625, + "logps/chosen": -664.0, + "logps/rejected": -831.5, + "loss": 0.5678, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.0576171875, + "rewards/margins": 7.609375, + "rewards/rejected": -6.55078125, + "step": 4576 + }, + { + "epoch": 0.9081799692445062, + "grad_norm": 28.521160475196787, + "learning_rate": 1.2303728590358454e-07, + "logits/chosen": 4.328125, + "logits/rejected": 4.24609375, + "logps/chosen": -1399.0, + "logps/rejected": -967.0, + "loss": 
0.2943, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4921875, + "rewards/margins": 10.703125, + "rewards/rejected": -7.203125, + "step": 4577 + }, + { + "epoch": 0.9083783917853068, + "grad_norm": 37.411667969220545, + "learning_rate": 1.2293894572105877e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.8515625, + "logps/chosen": -910.0, + "logps/rejected": -625.0, + "loss": 0.3157, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.14990234375, + "rewards/margins": 9.0078125, + "rewards/rejected": -5.8515625, + "step": 4578 + }, + { + "epoch": 0.9085768143261075, + "grad_norm": 28.861207241362422, + "learning_rate": 1.2284081039213476e-07, + "logits/chosen": 3.54296875, + "logits/rejected": 3.33203125, + "logps/chosen": -1094.0, + "logps/rejected": -897.0, + "loss": 0.3597, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4560546875, + "rewards/margins": 9.09375, + "rewards/rejected": -6.6171875, + "step": 4579 + }, + { + "epoch": 0.9087752368669081, + "grad_norm": 31.11979683660015, + "learning_rate": 1.227428799638862e-07, + "logits/chosen": 3.66796875, + "logits/rejected": 3.859375, + "logps/chosen": -743.0, + "logps/rejected": -1312.75, + "loss": 0.4832, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1826171875, + "rewards/margins": 9.265625, + "rewards/rejected": -8.06640625, + "step": 4580 + }, + { + "epoch": 0.9089736594077087, + "grad_norm": 28.655370702515576, + "learning_rate": 1.2264515448328873e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 3.7734375, + "logps/chosen": -1104.0, + "logps/rejected": -687.0, + "loss": 0.3847, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.85546875, + "rewards/margins": 8.2421875, + "rewards/rejected": -5.39453125, + "step": 4581 + }, + { + "epoch": 0.9091720819485094, + "grad_norm": 26.7398784859144, + "learning_rate": 1.225476339972193e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 3.76953125, + "logps/chosen": -1054.0, + "logps/rejected": -762.5, + "loss": 0.152, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 3.35546875, + "rewards/margins": 11.84375, + "rewards/rejected": -8.5234375, + "step": 4582 + }, + { + "epoch": 0.90937050448931, + "grad_norm": 28.480853858308393, + "learning_rate": 1.2245031855245705e-07, + "logits/chosen": 4.21875, + "logits/rejected": 3.8671875, + "logps/chosen": -918.0, + "logps/rejected": -1340.0, + "loss": 0.3751, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.033203125, + "rewards/margins": 10.2265625, + "rewards/rejected": -8.203125, + "step": 4583 + }, + { + "epoch": 0.9095689270301106, + "grad_norm": 25.847332008068882, + "learning_rate": 1.2235320819568216e-07, + "logits/chosen": 3.953125, + "logits/rejected": 4.12890625, + "logps/chosen": -837.5, + "logps/rejected": -738.0, + "loss": 0.4621, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.376953125, + "rewards/margins": 8.890625, + "rewards/rejected": -5.513671875, + "step": 4584 + }, + { + "epoch": 0.9097673495709112, + "grad_norm": 34.70767726391251, + "learning_rate": 1.2225630297347704e-07, + "logits/chosen": 3.65625, + "logits/rejected": 3.88671875, + "logps/chosen": -844.0, + "logps/rejected": -794.0, + "loss": 0.3905, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.57861328125, + "rewards/margins": 8.4765625, + "rewards/rejected": -6.8828125, + "step": 4585 + }, + { + "epoch": 0.9099657721117119, + "grad_norm": 32.76124046091352, + "learning_rate": 1.221596029323251e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.10546875, + "logps/chosen": -823.5, + "logps/rejected": -636.0, + "loss": 0.4475, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.2421875, + "rewards/margins": 7.06640625, + "rewards/rejected": -4.8251953125, + "step": 4586 + }, + { + "epoch": 0.9101641946525125, + "grad_norm": 31.372878501288618, + "learning_rate": 1.2206310811861182e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.3046875, + "logps/chosen": -1309.0, + "logps/rejected": -810.0, + "loss": 0.3499, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 3.43359375, + "rewards/margins": 10.4140625, + "rewards/rejected": -6.9921875, + "step": 4587 + }, + { + "epoch": 0.9103626171933131, + "grad_norm": 34.64259564558171, + "learning_rate": 1.2196681857862406e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 3.93359375, + "logps/chosen": -984.0, + "logps/rejected": -764.0, + "loss": 0.3863, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.12109375, + "rewards/margins": 9.3125, + "rewards/rejected": -7.1953125, + "step": 4588 + }, + { + "epoch": 0.9105610397341138, + "grad_norm": 20.988976540749213, + "learning_rate": 1.218707343585501e-07, + "logits/chosen": 3.66015625, + "logits/rejected": 3.9453125, + "logps/chosen": -785.0, + "logps/rejected": -1167.0, + "loss": 0.3637, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5703125, + "rewards/margins": 11.03125, + "rewards/rejected": -8.46875, + "step": 4589 + }, + { + "epoch": 0.9107594622749144, + "grad_norm": 28.42145031518413, + "learning_rate": 1.2177485550448003e-07, + "logits/chosen": 4.328125, + "logits/rejected": 3.96484375, + "logps/chosen": -1069.0, + "logps/rejected": -708.0, + "loss": 0.4187, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.765625, + "rewards/margins": 7.9765625, + "rewards/rejected": -5.21484375, + "step": 4590 + }, + { + "epoch": 0.910957884815715, + "grad_norm": 33.443905505901704, + "learning_rate": 1.2167918206240493e-07, + "logits/chosen": 3.61328125, + "logits/rejected": 3.75390625, + "logps/chosen": -1210.0, + "logps/rejected": -873.0, + "loss": 0.386, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.291015625, + "rewards/margins": 9.8671875, + "rewards/rejected": -7.5625, + "step": 4591 + }, + { + "epoch": 0.9111563073565156, + "grad_norm": 35.52077045460881, + "learning_rate": 1.2158371407821786e-07, + "logits/chosen": 4.1640625, + "logits/rejected": 4.38671875, + "logps/chosen": -719.0, + "logps/rejected": -659.5, + "loss": 0.506, + 
"rewards/accuracies": 0.75, + "rewards/chosen": 1.7236328125, + "rewards/margins": 6.0078125, + "rewards/rejected": -4.28125, + "step": 4592 + }, + { + "epoch": 0.9113547298973164, + "grad_norm": 31.12917668592987, + "learning_rate": 1.2148845159771312e-07, + "logits/chosen": 3.70703125, + "logits/rejected": 3.578125, + "logps/chosen": -717.5, + "logps/rejected": -686.0, + "loss": 0.4045, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.1875, + "rewards/margins": 8.75, + "rewards/rejected": -6.5625, + "step": 4593 + }, + { + "epoch": 0.911553152438117, + "grad_norm": 29.997683650918894, + "learning_rate": 1.2139339466658632e-07, + "logits/chosen": 4.1875, + "logits/rejected": 3.9765625, + "logps/chosen": -1123.0, + "logps/rejected": -697.5, + "loss": 0.2344, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.05078125, + "rewards/margins": 11.015625, + "rewards/rejected": -7.9765625, + "step": 4594 + }, + { + "epoch": 0.9117515749789176, + "grad_norm": 37.06441889725956, + "learning_rate": 1.2129854333043474e-07, + "logits/chosen": 3.67578125, + "logits/rejected": 3.890625, + "logps/chosen": -863.5, + "logps/rejected": -780.5, + "loss": 0.5102, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.721923828125, + "rewards/margins": 6.41796875, + "rewards/rejected": -4.6953125, + "step": 4595 + }, + { + "epoch": 0.9119499975197183, + "grad_norm": 25.742648980155174, + "learning_rate": 1.2120389763475665e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.1484375, + "logps/chosen": -976.0, + "logps/rejected": -1417.5, + "loss": 0.3135, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9453125, + "rewards/margins": 10.84375, + "rewards/rejected": -7.86328125, + "step": 4596 + }, + { + "epoch": 0.9121484200605189, + "grad_norm": 40.5743930182014, + "learning_rate": 1.2110945762495212e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.015625, + "logps/chosen": -1124.0, + "logps/rejected": -811.0, + "loss": 0.3773, + "rewards/accuracies": 0.875, + 
"rewards/chosen": 2.34765625, + "rewards/margins": 8.9765625, + "rewards/rejected": -6.6171875, + "step": 4597 + }, + { + "epoch": 0.9123468426013195, + "grad_norm": 25.977535287117313, + "learning_rate": 1.2101522334632228e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 3.86328125, + "logps/chosen": -858.0, + "logps/rejected": -701.5, + "loss": 0.5046, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.048828125, + "rewards/margins": 8.3203125, + "rewards/rejected": -6.265625, + "step": 4598 + }, + { + "epoch": 0.9125452651421202, + "grad_norm": 36.09380570181534, + "learning_rate": 1.209211948440696e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.875, + "logps/chosen": -993.0, + "logps/rejected": -611.0, + "loss": 0.3035, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.56640625, + "rewards/margins": 8.9375, + "rewards/rejected": -6.359375, + "step": 4599 + }, + { + "epoch": 0.9127436876829208, + "grad_norm": 28.57149416498302, + "learning_rate": 1.2082737216329792e-07, + "logits/chosen": 4.19140625, + "logits/rejected": 3.90234375, + "logps/chosen": -898.0, + "logps/rejected": -617.0, + "loss": 0.3892, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4287109375, + "rewards/margins": 7.671875, + "rewards/rejected": -6.234375, + "step": 4600 + }, + { + "epoch": 0.9129421102237214, + "grad_norm": 34.49738177960005, + "learning_rate": 1.2073375534901226e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 3.8828125, + "logps/chosen": -904.0, + "logps/rejected": -630.5, + "loss": 0.4763, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.85546875, + "rewards/margins": 6.984375, + "rewards/rejected": -5.1171875, + "step": 4601 + }, + { + "epoch": 0.913140532764522, + "grad_norm": 32.721251902080475, + "learning_rate": 1.20640344446119e-07, + "logits/chosen": 3.7734375, + "logits/rejected": 3.49609375, + "logps/chosen": -780.0, + "logps/rejected": -573.5, + "loss": 0.4457, + "rewards/accuracies": 0.875, + "rewards/chosen": 
1.67578125, + "rewards/margins": 7.328125, + "rewards/rejected": -5.65625, + "step": 4602 + }, + { + "epoch": 0.9133389553053227, + "grad_norm": 39.58117906051873, + "learning_rate": 1.2054713949942574e-07, + "logits/chosen": 4.03125, + "logits/rejected": 3.75390625, + "logps/chosen": -1041.0, + "logps/rejected": -666.0, + "loss": 0.3401, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.63671875, + "rewards/margins": 7.8984375, + "rewards/rejected": -5.26171875, + "step": 4603 + }, + { + "epoch": 0.9135373778461233, + "grad_norm": 33.94960180331115, + "learning_rate": 1.204541405536411e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 4.02734375, + "logps/chosen": -765.5, + "logps/rejected": -640.5, + "loss": 0.4229, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.8515625, + "rewards/margins": 7.921875, + "rewards/rejected": -6.0703125, + "step": 4604 + }, + { + "epoch": 0.9137358003869239, + "grad_norm": 34.38792778919776, + "learning_rate": 1.2036134765337513e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.2265625, + "logps/chosen": -699.5, + "logps/rejected": -592.5, + "loss": 0.4938, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.9443359375, + "rewards/margins": 5.8671875, + "rewards/rejected": -3.921875, + "step": 4605 + }, + { + "epoch": 0.9139342229277246, + "grad_norm": 33.76901888590354, + "learning_rate": 1.2026876084313888e-07, + "logits/chosen": 4.42578125, + "logits/rejected": 4.44921875, + "logps/chosen": -755.0, + "logps/rejected": -889.0, + "loss": 0.5248, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.35986328125, + "rewards/margins": 6.5078125, + "rewards/rejected": -5.1484375, + "step": 4606 + }, + { + "epoch": 0.9141326454685252, + "grad_norm": 31.524427894284912, + "learning_rate": 1.2017638016734463e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.94921875, + "logps/chosen": -765.0, + "logps/rejected": -676.5, + "loss": 0.3363, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.47216796875, 
+ "rewards/margins": 8.8203125, + "rewards/rejected": -6.34375, + "step": 4607 + }, + { + "epoch": 0.9143310680093258, + "grad_norm": 34.009671616811595, + "learning_rate": 1.2008420567030575e-07, + "logits/chosen": 3.609375, + "logits/rejected": 3.390625, + "logps/chosen": -1013.0, + "logps/rejected": -674.0, + "loss": 0.4695, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.44921875, + "rewards/margins": 6.896484375, + "rewards/rejected": -4.447998046875, + "step": 4608 + }, + { + "epoch": 0.9145294905501264, + "grad_norm": 25.25583820419381, + "learning_rate": 1.1999223739623666e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.16796875, + "logps/chosen": -1037.0, + "logps/rejected": -855.5, + "loss": 0.5038, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.794921875, + "rewards/margins": 7.6484375, + "rewards/rejected": -4.87109375, + "step": 4609 + }, + { + "epoch": 0.9147279130909272, + "grad_norm": 35.30158232095903, + "learning_rate": 1.19900475389253e-07, + "logits/chosen": 4.4765625, + "logits/rejected": 4.63671875, + "logps/chosen": -956.0, + "logps/rejected": -1491.0, + "loss": 0.4167, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6640625, + "rewards/margins": 12.296875, + "rewards/rejected": -9.6484375, + "step": 4610 + }, + { + "epoch": 0.9149263356317278, + "grad_norm": 30.659134275892356, + "learning_rate": 1.1980891969337128e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.51171875, + "logps/chosen": -1049.0, + "logps/rejected": -887.0, + "loss": 0.4103, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5146484375, + "rewards/margins": 8.75, + "rewards/rejected": -6.234375, + "step": 4611 + }, + { + "epoch": 0.9151247581725284, + "grad_norm": 31.26828061966177, + "learning_rate": 1.197175703525092e-07, + "logits/chosen": 4.54296875, + "logits/rejected": 4.6640625, + "logps/chosen": -868.0, + "logps/rejected": -1291.5, + "loss": 0.5265, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.859375, + 
"rewards/margins": 8.5703125, + "rewards/rejected": -6.70703125, + "step": 4612 + }, + { + "epoch": 0.9153231807133291, + "grad_norm": 35.712041527322015, + "learning_rate": 1.1962642741048543e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.15625, + "logps/chosen": -968.0, + "logps/rejected": -590.0, + "loss": 0.4987, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.96875, + "rewards/margins": 7.25, + "rewards/rejected": -5.27734375, + "step": 4613 + }, + { + "epoch": 0.9155216032541297, + "grad_norm": 20.439666988871753, + "learning_rate": 1.1953549091101948e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.0390625, + "logps/chosen": -939.5, + "logps/rejected": -637.5, + "loss": 0.4065, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.72265625, + "rewards/margins": 8.53125, + "rewards/rejected": -5.796875, + "step": 4614 + }, + { + "epoch": 0.9157200257949303, + "grad_norm": 17.843006700932634, + "learning_rate": 1.194447608977322e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 3.92578125, + "logps/chosen": -1592.0, + "logps/rejected": -935.0, + "loss": 0.2265, + "rewards/accuracies": 0.90625, + "rewards/chosen": 4.53125, + "rewards/margins": 12.421875, + "rewards/rejected": -7.90625, + "step": 4615 + }, + { + "epoch": 0.9159184483357309, + "grad_norm": 31.40903126469993, + "learning_rate": 1.1935423741414494e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.140625, + "logps/chosen": -935.0, + "logps/rejected": -683.0, + "loss": 0.4628, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.232421875, + "rewards/margins": 6.546875, + "rewards/rejected": -4.3125, + "step": 4616 + }, + { + "epoch": 0.9161168708765316, + "grad_norm": 23.52395661317516, + "learning_rate": 1.192639205036804e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.7421875, + "logps/chosen": -961.0, + "logps/rejected": -726.0, + "loss": 0.3764, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.234375, + "rewards/margins": 8.9921875, + 
"rewards/rejected": -6.75390625, + "step": 4617 + }, + { + "epoch": 0.9163152934173322, + "grad_norm": 29.271530137008984, + "learning_rate": 1.1917381020966187e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.3125, + "logps/chosen": -1044.0, + "logps/rejected": -848.0, + "loss": 0.3263, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.38671875, + "rewards/margins": 9.7265625, + "rewards/rejected": -6.328125, + "step": 4618 + }, + { + "epoch": 0.9165137159581328, + "grad_norm": 39.077268841082166, + "learning_rate": 1.1908390657531368e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.11328125, + "logps/chosen": -712.0, + "logps/rejected": -728.0, + "loss": 0.4739, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.41357421875, + "rewards/margins": 7.625, + "rewards/rejected": -5.20703125, + "step": 4619 + }, + { + "epoch": 0.9167121384989335, + "grad_norm": 25.294537362746826, + "learning_rate": 1.1899420964376114e-07, + "logits/chosen": 3.73828125, + "logits/rejected": 3.484375, + "logps/chosen": -1256.0, + "logps/rejected": -717.5, + "loss": 0.3554, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.015625, + "rewards/margins": 8.90625, + "rewards/rejected": -5.8984375, + "step": 4620 + }, + { + "epoch": 0.9169105610397341, + "grad_norm": 26.230957652723554, + "learning_rate": 1.1890471945802999e-07, + "logits/chosen": 3.921875, + "logits/rejected": 3.6328125, + "logps/chosen": -1172.5, + "logps/rejected": -785.0, + "loss": 0.4078, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.853515625, + "rewards/margins": 7.3984375, + "rewards/rejected": -5.546875, + "step": 4621 + }, + { + "epoch": 0.9171089835805347, + "grad_norm": 30.013400889423966, + "learning_rate": 1.1881543606104732e-07, + "logits/chosen": 3.8203125, + "logits/rejected": 3.76171875, + "logps/chosen": -1020.0, + "logps/rejected": -777.0, + "loss": 0.3317, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.171875, + "rewards/margins": 8.5390625, + 
"rewards/rejected": -5.3515625, + "step": 4622 + }, + { + "epoch": 0.9173074061213354, + "grad_norm": 28.44204795716483, + "learning_rate": 1.1872635949564078e-07, + "logits/chosen": 3.765625, + "logits/rejected": 3.92578125, + "logps/chosen": -938.0, + "logps/rejected": -902.0, + "loss": 0.4886, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.1796875, + "rewards/margins": 14.0703125, + "rewards/rejected": -11.90625, + "step": 4623 + }, + { + "epoch": 0.917505828662136, + "grad_norm": 30.33416066070719, + "learning_rate": 1.1863748980453873e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 3.87109375, + "logps/chosen": -957.0, + "logps/rejected": -602.0, + "loss": 0.3217, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.28125, + "rewards/margins": 9.1796875, + "rewards/rejected": -5.90625, + "step": 4624 + }, + { + "epoch": 0.9177042512029366, + "grad_norm": 36.61693120359012, + "learning_rate": 1.1854882703037039e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 4.09375, + "logps/chosen": -1308.0, + "logps/rejected": -932.0, + "loss": 0.3217, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.90625, + "rewards/margins": 11.046875, + "rewards/rejected": -7.140625, + "step": 4625 + }, + { + "epoch": 0.9179026737437372, + "grad_norm": 26.272932496198095, + "learning_rate": 1.1846037121566568e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 3.859375, + "logps/chosen": -749.5, + "logps/rejected": -511.0, + "loss": 0.4773, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.546875, + "rewards/margins": 7.3046875, + "rewards/rejected": -4.7578125, + "step": 4626 + }, + { + "epoch": 0.918101096284538, + "grad_norm": 32.46014243696017, + "learning_rate": 1.183721224028555e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 4.2265625, + "logps/chosen": -1073.0, + "logps/rejected": -860.0, + "loss": 0.5139, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1328125, + "rewards/margins": 12.8359375, + "rewards/rejected": 
-9.728515625, + "step": 4627 + }, + { + "epoch": 0.9182995188253386, + "grad_norm": 29.23606428346772, + "learning_rate": 1.1828408063427094e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.90625, + "logps/chosen": -910.0, + "logps/rejected": -1677.0, + "loss": 0.3181, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.19140625, + "rewards/margins": 11.46875, + "rewards/rejected": -9.296875, + "step": 4628 + }, + { + "epoch": 0.9184979413661392, + "grad_norm": 40.17515002759322, + "learning_rate": 1.1819624595214425e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.421875, + "logps/chosen": -898.0, + "logps/rejected": -641.0, + "loss": 0.3738, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6796875, + "rewards/margins": 7.640625, + "rewards/rejected": -4.96875, + "step": 4629 + }, + { + "epoch": 0.9186963639069399, + "grad_norm": 27.098155674765163, + "learning_rate": 1.1810861839860814e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.1015625, + "logps/chosen": -1100.5, + "logps/rejected": -813.0, + "loss": 0.3454, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.544921875, + "rewards/margins": 9.1328125, + "rewards/rejected": -5.5703125, + "step": 4630 + }, + { + "epoch": 0.9188947864477405, + "grad_norm": 24.810928985576748, + "learning_rate": 1.1802119801569594e-07, + "logits/chosen": 4.28515625, + "logits/rejected": 4.4140625, + "logps/chosen": -952.0, + "logps/rejected": -666.0, + "loss": 0.5766, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.3046875, + "rewards/margins": 5.8046875, + "rewards/rejected": -3.501953125, + "step": 4631 + }, + { + "epoch": 0.9190932089885411, + "grad_norm": 27.89709467441081, + "learning_rate": 1.1793398484534175e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.1328125, + "logps/chosen": -1530.0, + "logps/rejected": -853.0, + "loss": 0.4068, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.4794921875, + "rewards/margins": 9.2890625, + "rewards/rejected": -5.80859375, + 
"step": 4632 + }, + { + "epoch": 0.9192916315293417, + "grad_norm": 30.12113752062722, + "learning_rate": 1.1784697892938011e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 3.89453125, + "logps/chosen": -791.0, + "logps/rejected": -545.0, + "loss": 0.3878, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.328125, + "rewards/margins": 6.28125, + "rewards/rejected": -3.9609375, + "step": 4633 + }, + { + "epoch": 0.9194900540701424, + "grad_norm": 34.55209095207353, + "learning_rate": 1.1776018030954619e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.87890625, + "logps/chosen": -884.0, + "logps/rejected": -570.5, + "loss": 0.3692, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8984375, + "rewards/margins": 7.2890625, + "rewards/rejected": -4.38671875, + "step": 4634 + }, + { + "epoch": 0.919688476610943, + "grad_norm": 24.945233363376165, + "learning_rate": 1.1767358902747577e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.13671875, + "logps/chosen": -930.0, + "logps/rejected": -688.5, + "loss": 0.4327, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8603515625, + "rewards/margins": 7.234375, + "rewards/rejected": -5.359375, + "step": 4635 + }, + { + "epoch": 0.9198868991517436, + "grad_norm": 33.89940446546879, + "learning_rate": 1.1758720512470523e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.109375, + "logps/chosen": -1072.0, + "logps/rejected": -834.0, + "loss": 0.3783, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.388671875, + "rewards/margins": 9.5234375, + "rewards/rejected": -7.1171875, + "step": 4636 + }, + { + "epoch": 0.9200853216925443, + "grad_norm": 36.44118849896755, + "learning_rate": 1.1750102864267128e-07, + "logits/chosen": 3.87109375, + "logits/rejected": 3.953125, + "logps/chosen": -730.5, + "logps/rejected": -643.5, + "loss": 0.3848, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5859375, + "rewards/margins": 7.7734375, + "rewards/rejected": -5.1875, + "step": 4637 + }, + { 
+ "epoch": 0.9202837442333449, + "grad_norm": 35.14617779506139, + "learning_rate": 1.1741505962271129e-07, + "logits/chosen": 3.7734375, + "logits/rejected": 3.609375, + "logps/chosen": -796.0, + "logps/rejected": -611.5, + "loss": 0.4602, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1875, + "rewards/margins": 6.9375, + "rewards/rejected": -4.74609375, + "step": 4638 + }, + { + "epoch": 0.9204821667741455, + "grad_norm": 34.14957716886143, + "learning_rate": 1.1732929810606306e-07, + "logits/chosen": 3.58203125, + "logits/rejected": 3.64453125, + "logps/chosen": -644.5, + "logps/rejected": -652.5, + "loss": 0.4131, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.9892578125, + "rewards/margins": 10.4296875, + "rewards/rejected": -8.45703125, + "step": 4639 + }, + { + "epoch": 0.9206805893149462, + "grad_norm": 23.659323259931895, + "learning_rate": 1.1724374413386498e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.359375, + "logps/chosen": -855.0, + "logps/rejected": -639.0, + "loss": 0.3727, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.283203125, + "rewards/margins": 8.28125, + "rewards/rejected": -6.0, + "step": 4640 + }, + { + "epoch": 0.9208790118557468, + "grad_norm": 37.83264719372619, + "learning_rate": 1.1715839774715555e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.89453125, + "logps/chosen": -1009.0, + "logps/rejected": -680.5, + "loss": 0.5102, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.431640625, + "rewards/margins": 6.7421875, + "rewards/rejected": -4.30859375, + "step": 4641 + }, + { + "epoch": 0.9210774343965474, + "grad_norm": 28.78354921521601, + "learning_rate": 1.1707325898687409e-07, + "logits/chosen": 3.77734375, + "logits/rejected": 3.78515625, + "logps/chosen": -985.0, + "logps/rejected": -681.0, + "loss": 0.3288, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.109375, + "rewards/margins": 8.3671875, + "rewards/rejected": -5.25390625, + "step": 4642 + }, + { + "epoch": 
0.921275856937348, + "grad_norm": 36.88861486508789, + "learning_rate": 1.1698832789386009e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.15625, + "logps/chosen": -1105.0, + "logps/rejected": -960.0, + "loss": 0.4015, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.109375, + "rewards/margins": 9.90625, + "rewards/rejected": -6.8046875, + "step": 4643 + }, + { + "epoch": 0.9214742794781488, + "grad_norm": 33.04294974229393, + "learning_rate": 1.1690360450885345e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.09375, + "logps/chosen": -1214.0, + "logps/rejected": -1398.0, + "loss": 0.3903, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.705078125, + "rewards/margins": 11.5078125, + "rewards/rejected": -8.8046875, + "step": 4644 + }, + { + "epoch": 0.9216727020189494, + "grad_norm": 29.032495805613994, + "learning_rate": 1.1681908887249458e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 4.0, + "logps/chosen": -1010.5, + "logps/rejected": -753.5, + "loss": 0.3714, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.271484375, + "rewards/margins": 8.484375, + "rewards/rejected": -6.23046875, + "step": 4645 + }, + { + "epoch": 0.92187112455975, + "grad_norm": 37.63000625034827, + "learning_rate": 1.1673478102532403e-07, + "logits/chosen": 4.25390625, + "logits/rejected": 4.03125, + "logps/chosen": -1030.0, + "logps/rejected": -682.5, + "loss": 0.416, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.17578125, + "rewards/margins": 8.1484375, + "rewards/rejected": -4.984375, + "step": 4646 + }, + { + "epoch": 0.9220695471005507, + "grad_norm": 35.298754912507796, + "learning_rate": 1.1665068100778283e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 3.91796875, + "logps/chosen": -1079.0, + "logps/rejected": -1066.5, + "loss": 0.2933, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3359375, + "rewards/margins": 9.08984375, + "rewards/rejected": -5.74609375, + "step": 4647 + }, + { + "epoch": 0.9222679696413513, 
+ "grad_norm": 38.1757539478934, + "learning_rate": 1.1656678886021226e-07, + "logits/chosen": 4.1875, + "logits/rejected": 3.63671875, + "logps/chosen": -826.5, + "logps/rejected": -1377.0, + "loss": 0.5901, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.21026611328125, + "rewards/margins": 7.25, + "rewards/rejected": -6.03515625, + "step": 4648 + }, + { + "epoch": 0.9224663921821519, + "grad_norm": 30.6674989443839, + "learning_rate": 1.1648310462285385e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.1953125, + "logps/chosen": -818.0, + "logps/rejected": -711.0, + "loss": 0.5157, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.619140625, + "rewards/margins": 10.453125, + "rewards/rejected": -7.83984375, + "step": 4649 + }, + { + "epoch": 0.9226648147229525, + "grad_norm": 25.294351235567884, + "learning_rate": 1.1639962833584951e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 3.98046875, + "logps/chosen": -1054.0, + "logps/rejected": -687.0, + "loss": 0.2923, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.423828125, + "rewards/margins": 10.15625, + "rewards/rejected": -6.7421875, + "step": 4650 + }, + { + "epoch": 0.9228632372637532, + "grad_norm": 29.86819566806509, + "learning_rate": 1.1631636003924125e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.4453125, + "logps/chosen": -666.5, + "logps/rejected": -1450.0, + "loss": 0.5322, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.01171875, + "rewards/margins": 7.90234375, + "rewards/rejected": -5.87890625, + "step": 4651 + }, + { + "epoch": 0.9230616598045538, + "grad_norm": 24.601053056012166, + "learning_rate": 1.1623329977297153e-07, + "logits/chosen": 4.203125, + "logits/rejected": 3.796875, + "logps/chosen": -1125.0, + "logps/rejected": -767.0, + "loss": 0.324, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.71484375, + "rewards/margins": 7.49609375, + "rewards/rejected": -5.7734375, + "step": 4652 + }, + { + "epoch": 0.9232600823453544, + "grad_norm": 
36.68524509415603, + "learning_rate": 1.1615044757688264e-07, + "logits/chosen": 4.4140625, + "logits/rejected": 4.28125, + "logps/chosen": -876.0, + "logps/rejected": -669.0, + "loss": 0.4348, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.978515625, + "rewards/margins": 7.3125, + "rewards/rejected": -5.33203125, + "step": 4653 + }, + { + "epoch": 0.9234585048861551, + "grad_norm": 27.884838093139596, + "learning_rate": 1.1606780349071756e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 3.87890625, + "logps/chosen": -813.5, + "logps/rejected": -1232.0, + "loss": 0.3764, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.466796875, + "rewards/margins": 10.609375, + "rewards/rejected": -8.171875, + "step": 4654 + }, + { + "epoch": 0.9236569274269557, + "grad_norm": 40.86186854310069, + "learning_rate": 1.159853675541191e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.0078125, + "logps/chosen": -862.0, + "logps/rejected": -568.5, + "loss": 0.5579, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.75, + "rewards/margins": 6.28515625, + "rewards/rejected": -4.51953125, + "step": 4655 + }, + { + "epoch": 0.9238553499677563, + "grad_norm": 25.695006011762146, + "learning_rate": 1.1590313980663029e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.62890625, + "logps/chosen": -1224.0, + "logps/rejected": -947.0, + "loss": 0.4018, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.28125, + "rewards/margins": 9.65625, + "rewards/rejected": -6.36328125, + "step": 4656 + }, + { + "epoch": 0.924053772508557, + "grad_norm": 23.0070066949819, + "learning_rate": 1.1582112028769428e-07, + "logits/chosen": 4.03515625, + "logits/rejected": 3.9296875, + "logps/chosen": -1174.0, + "logps/rejected": -616.5, + "loss": 0.2806, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.09375, + "rewards/margins": 8.2890625, + "rewards/rejected": -5.1953125, + "step": 4657 + }, + { + "epoch": 0.9242521950493576, + "grad_norm": 31.195567558688378, + 
"learning_rate": 1.1573930903665446e-07, + "logits/chosen": 3.77734375, + "logits/rejected": 4.1015625, + "logps/chosen": -1156.0, + "logps/rejected": -1720.0, + "loss": 0.4101, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.94140625, + "rewards/margins": 10.625, + "rewards/rejected": -8.6796875, + "step": 4658 + }, + { + "epoch": 0.9244506175901582, + "grad_norm": 27.669133558758062, + "learning_rate": 1.1565770609275422e-07, + "logits/chosen": 4.5390625, + "logits/rejected": 4.33984375, + "logps/chosen": -1106.5, + "logps/rejected": -694.0, + "loss": 0.3939, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.732421875, + "rewards/margins": 8.69140625, + "rewards/rejected": -5.95703125, + "step": 4659 + }, + { + "epoch": 0.9246490401309588, + "grad_norm": 32.55711454764356, + "learning_rate": 1.15576311495137e-07, + "logits/chosen": 4.078125, + "logits/rejected": 3.9765625, + "logps/chosen": -1049.0, + "logps/rejected": -670.5, + "loss": 0.4054, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.62890625, + "rewards/margins": 8.1328125, + "rewards/rejected": -5.5, + "step": 4660 + }, + { + "epoch": 0.9248474626717595, + "grad_norm": 25.523781985239896, + "learning_rate": 1.1549512528284638e-07, + "logits/chosen": 3.68359375, + "logits/rejected": 3.7734375, + "logps/chosen": -836.0, + "logps/rejected": -686.0, + "loss": 0.3731, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.611328125, + "rewards/margins": 18.6015625, + "rewards/rejected": -16.0078125, + "step": 4661 + }, + { + "epoch": 0.9250458852125601, + "grad_norm": 27.855501330708638, + "learning_rate": 1.1541414749482596e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.73828125, + "logps/chosen": -928.0, + "logps/rejected": -850.0, + "loss": 0.2282, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.22265625, + "rewards/margins": 11.328125, + "rewards/rejected": -8.1171875, + "step": 4662 + }, + { + "epoch": 0.9252443077533608, + "grad_norm": 31.957594043779572, + 
"learning_rate": 1.1533337816991931e-07, + "logits/chosen": 3.5234375, + "logits/rejected": 3.69140625, + "logps/chosen": -1347.0, + "logps/rejected": -736.5, + "loss": 0.3573, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.478515625, + "rewards/margins": 7.6015625, + "rewards/rejected": -4.09765625, + "step": 4663 + }, + { + "epoch": 0.9254427302941615, + "grad_norm": 35.16413648975108, + "learning_rate": 1.1525281734686998e-07, + "logits/chosen": 3.3828125, + "logits/rejected": 3.3515625, + "logps/chosen": -787.0, + "logps/rejected": -635.0, + "loss": 0.5562, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6026458740234375, + "rewards/margins": 6.1953125, + "rewards/rejected": -4.59375, + "step": 4664 + }, + { + "epoch": 0.9256411528349621, + "grad_norm": 30.2372965940851, + "learning_rate": 1.1517246506432172e-07, + "logits/chosen": 4.60546875, + "logits/rejected": 4.36328125, + "logps/chosen": -1488.0, + "logps/rejected": -919.0, + "loss": 0.2426, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.2109375, + "rewards/margins": 9.859375, + "rewards/rejected": -5.6484375, + "step": 4665 + }, + { + "epoch": 0.9258395753757627, + "grad_norm": 41.046166557348585, + "learning_rate": 1.1509232136081795e-07, + "logits/chosen": 4.3984375, + "logits/rejected": 4.26171875, + "logps/chosen": -858.0, + "logps/rejected": -708.0, + "loss": 0.6255, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.078125, + "rewards/margins": 5.21875, + "rewards/rejected": -3.150390625, + "step": 4666 + }, + { + "epoch": 0.9260379979165633, + "grad_norm": 26.782533172246854, + "learning_rate": 1.1501238627480227e-07, + "logits/chosen": 4.39453125, + "logits/rejected": 4.2734375, + "logps/chosen": -1371.0, + "logps/rejected": -893.5, + "loss": 0.2407, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.736328125, + "rewards/margins": 11.6875, + "rewards/rejected": -7.953125, + "step": 4667 + }, + { + "epoch": 0.926236420457364, + "grad_norm": 30.90680485277083, + 
"learning_rate": 1.149326598446181e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.8359375, + "logps/chosen": -766.0, + "logps/rejected": -784.5, + "loss": 0.4335, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.06689453125, + "rewards/margins": 7.60546875, + "rewards/rejected": -4.5341796875, + "step": 4668 + }, + { + "epoch": 0.9264348429981646, + "grad_norm": 27.28701149190004, + "learning_rate": 1.1485314210850875e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.125, + "logps/chosen": -1078.0, + "logps/rejected": -775.0, + "loss": 0.2278, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.11328125, + "rewards/margins": 14.484375, + "rewards/rejected": -10.359375, + "step": 4669 + }, + { + "epoch": 0.9266332655389652, + "grad_norm": 27.84055414530782, + "learning_rate": 1.1477383310461755e-07, + "logits/chosen": 4.015625, + "logits/rejected": 4.20703125, + "logps/chosen": -788.0, + "logps/rejected": -1291.0, + "loss": 0.4774, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.72265625, + "rewards/margins": 9.609375, + "rewards/rejected": -6.90625, + "step": 4670 + }, + { + "epoch": 0.9268316880797659, + "grad_norm": 35.764609950918725, + "learning_rate": 1.1469473287098749e-07, + "logits/chosen": 4.31640625, + "logits/rejected": 4.10546875, + "logps/chosen": -1116.0, + "logps/rejected": -808.0, + "loss": 0.3232, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.80078125, + "rewards/margins": 8.625, + "rewards/rejected": -5.84375, + "step": 4671 + }, + { + "epoch": 0.9270301106205665, + "grad_norm": 21.617844136603583, + "learning_rate": 1.1461584144556174e-07, + "logits/chosen": 3.66015625, + "logits/rejected": 3.4765625, + "logps/chosen": -910.0, + "logps/rejected": -779.0, + "loss": 0.3393, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.640625, + "rewards/margins": 9.0625, + "rewards/rejected": -6.421875, + "step": 4672 + }, + { + "epoch": 0.9272285331613671, + "grad_norm": 35.45596424967479, + "learning_rate": 
1.1453715886618287e-07, + "logits/chosen": 3.953125, + "logits/rejected": 4.234375, + "logps/chosen": -994.0, + "logps/rejected": -895.0, + "loss": 0.3602, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.84375, + "rewards/margins": 8.8828125, + "rewards/rejected": -6.05078125, + "step": 4673 + }, + { + "epoch": 0.9274269557021678, + "grad_norm": 28.89528304306086, + "learning_rate": 1.1445868517059371e-07, + "logits/chosen": 4.36328125, + "logits/rejected": 4.15234375, + "logps/chosen": -1190.0, + "logps/rejected": -776.0, + "loss": 0.4742, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.333984375, + "rewards/margins": 8.1328125, + "rewards/rejected": -4.8046875, + "step": 4674 + }, + { + "epoch": 0.9276253782429684, + "grad_norm": 29.096966428988637, + "learning_rate": 1.1438042039643664e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.296875, + "logps/chosen": -829.0, + "logps/rejected": -694.5, + "loss": 0.4765, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.75, + "rewards/margins": 5.9609375, + "rewards/rejected": -4.203125, + "step": 4675 + }, + { + "epoch": 0.927823800783769, + "grad_norm": 34.089762626508225, + "learning_rate": 1.143023645812538e-07, + "logits/chosen": 4.1875, + "logits/rejected": 4.08984375, + "logps/chosen": -1331.0, + "logps/rejected": -1423.0, + "loss": 0.3768, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.0078125, + "rewards/margins": 9.78125, + "rewards/rejected": -7.74609375, + "step": 4676 + }, + { + "epoch": 0.9280222233245696, + "grad_norm": 23.712624555491658, + "learning_rate": 1.142245177624874e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.8515625, + "logps/chosen": -1296.0, + "logps/rejected": -890.0, + "loss": 0.3169, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.34375, + "rewards/margins": 9.25, + "rewards/rejected": -5.9140625, + "step": 4677 + }, + { + "epoch": 0.9282206458653703, + "grad_norm": 35.02844097525133, + "learning_rate": 1.1414687997747887e-07, + 
"logits/chosen": 4.40234375, + "logits/rejected": 4.43359375, + "logps/chosen": -968.0, + "logps/rejected": -1681.0, + "loss": 0.5277, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.814453125, + "rewards/margins": 16.61328125, + "rewards/rejected": -13.779296875, + "step": 4678 + }, + { + "epoch": 0.9284190684061709, + "grad_norm": 26.36410196432372, + "learning_rate": 1.1406945126346994e-07, + "logits/chosen": 3.61328125, + "logits/rejected": 3.71484375, + "logps/chosen": -1219.0, + "logps/rejected": -747.0, + "loss": 0.2855, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.63671875, + "rewards/margins": 8.6015625, + "rewards/rejected": -5.953125, + "step": 4679 + }, + { + "epoch": 0.9286174909469715, + "grad_norm": 42.27411143678077, + "learning_rate": 1.1399223165760169e-07, + "logits/chosen": 3.93359375, + "logits/rejected": 4.04296875, + "logps/chosen": -735.0, + "logps/rejected": -571.0, + "loss": 0.4512, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.9921875, + "rewards/margins": 5.5, + "rewards/rejected": -3.4921875, + "step": 4680 + }, + { + "epoch": 0.9288159134877723, + "grad_norm": 31.34680209123411, + "learning_rate": 1.1391522119691496e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 3.91796875, + "logps/chosen": -964.0, + "logps/rejected": -656.0, + "loss": 0.2698, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.64453125, + "rewards/margins": 9.3515625, + "rewards/rejected": -6.7109375, + "step": 4681 + }, + { + "epoch": 0.9290143360285729, + "grad_norm": 27.349805448220362, + "learning_rate": 1.1383841991835032e-07, + "logits/chosen": 3.85546875, + "logits/rejected": 4.0625, + "logps/chosen": -511.0, + "logps/rejected": -529.5, + "loss": 0.5669, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.84375, + "rewards/margins": 5.3125, + "rewards/rejected": -3.4609375, + "step": 4682 + }, + { + "epoch": 0.9292127585693735, + "grad_norm": 27.421185454734484, + "learning_rate": 1.1376182785874792e-07, + 
"logits/chosen": 3.48046875, + "logits/rejected": 3.59765625, + "logps/chosen": -1214.0, + "logps/rejected": -1235.0, + "loss": 0.3498, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.619140625, + "rewards/margins": 12.11328125, + "rewards/rejected": -9.513671875, + "step": 4683 + }, + { + "epoch": 0.9294111811101741, + "grad_norm": 31.226768594188055, + "learning_rate": 1.136854450548478e-07, + "logits/chosen": 3.73828125, + "logits/rejected": 3.55078125, + "logps/chosen": -975.5, + "logps/rejected": -698.5, + "loss": 0.4469, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3017578125, + "rewards/margins": 7.5234375, + "rewards/rejected": -5.23046875, + "step": 4684 + }, + { + "epoch": 0.9296096036509748, + "grad_norm": 27.08136283870044, + "learning_rate": 1.1360927154328922e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.2421875, + "logps/chosen": -771.0, + "logps/rejected": -666.5, + "loss": 0.4007, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.962890625, + "rewards/margins": 8.2109375, + "rewards/rejected": -5.26171875, + "step": 4685 + }, + { + "epoch": 0.9298080261917754, + "grad_norm": 29.32811498441075, + "learning_rate": 1.1353330736061144e-07, + "logits/chosen": 4.05078125, + "logits/rejected": 4.0859375, + "logps/chosen": -914.0, + "logps/rejected": -776.0, + "loss": 0.3668, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.79296875, + "rewards/margins": 10.3125, + "rewards/rejected": -7.53515625, + "step": 4686 + }, + { + "epoch": 0.930006448732576, + "grad_norm": 31.842425462978763, + "learning_rate": 1.1345755254325298e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 3.85546875, + "logps/chosen": -1125.0, + "logps/rejected": -926.0, + "loss": 0.3233, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.68359375, + "rewards/margins": 9.6171875, + "rewards/rejected": -5.95703125, + "step": 4687 + }, + { + "epoch": 0.9302048712733767, + "grad_norm": 27.729241094252785, + "learning_rate": 1.1338200712755219e-07, + 
"logits/chosen": 4.3515625, + "logits/rejected": 4.16796875, + "logps/chosen": -978.0, + "logps/rejected": -717.0, + "loss": 0.4745, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.921875, + "rewards/margins": 7.1796875, + "rewards/rejected": -4.2578125, + "step": 4688 + }, + { + "epoch": 0.9304032938141773, + "grad_norm": 28.044889973852204, + "learning_rate": 1.1330667114974672e-07, + "logits/chosen": 4.5625, + "logits/rejected": 4.38671875, + "logps/chosen": -887.0, + "logps/rejected": -625.0, + "loss": 0.5234, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.33984375, + "rewards/margins": 6.63671875, + "rewards/rejected": -4.3017578125, + "step": 4689 + }, + { + "epoch": 0.9306017163549779, + "grad_norm": 39.98605193624785, + "learning_rate": 1.1323154464597406e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.3515625, + "logps/chosen": -908.0, + "logps/rejected": -731.5, + "loss": 0.3668, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.033203125, + "rewards/margins": 8.46875, + "rewards/rejected": -6.4453125, + "step": 4690 + }, + { + "epoch": 0.9308001388957785, + "grad_norm": 35.182862111059336, + "learning_rate": 1.1315662765227102e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.2421875, + "logps/chosen": -872.5, + "logps/rejected": -979.0, + "loss": 0.2397, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.02734375, + "rewards/margins": 10.734375, + "rewards/rejected": -7.71484375, + "step": 4691 + }, + { + "epoch": 0.9309985614365792, + "grad_norm": 41.494281735681476, + "learning_rate": 1.1308192020457391e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 4.19140625, + "logps/chosen": -756.0, + "logps/rejected": -804.0, + "loss": 0.4651, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4404296875, + "rewards/margins": 7.88671875, + "rewards/rejected": -5.453125, + "step": 4692 + }, + { + "epoch": 0.9311969839773798, + "grad_norm": 28.96997431798963, + "learning_rate": 1.1300742233871857e-07, + 
"logits/chosen": 3.82421875, + "logits/rejected": 3.7578125, + "logps/chosen": -864.5, + "logps/rejected": -559.5, + "loss": 0.3191, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.28125, + "rewards/margins": 8.1875, + "rewards/rejected": -5.921875, + "step": 4693 + }, + { + "epoch": 0.9313954065181804, + "grad_norm": 22.909396177719977, + "learning_rate": 1.129331340904403e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.03515625, + "logps/chosen": -921.0, + "logps/rejected": -610.5, + "loss": 0.4966, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.884765625, + "rewards/margins": 6.27734375, + "rewards/rejected": -4.38671875, + "step": 4694 + }, + { + "epoch": 0.9315938290589811, + "grad_norm": 33.19315662530038, + "learning_rate": 1.128590554953739e-07, + "logits/chosen": 3.70703125, + "logits/rejected": 3.84375, + "logps/chosen": -1069.0, + "logps/rejected": -889.0, + "loss": 0.3995, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.65625, + "rewards/margins": 8.42578125, + "rewards/rejected": -5.78125, + "step": 4695 + }, + { + "epoch": 0.9317922515997817, + "grad_norm": 30.707546916665745, + "learning_rate": 1.1278518658905347e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.29296875, + "logps/chosen": -887.0, + "logps/rejected": -543.0, + "loss": 0.3219, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.412109375, + "rewards/margins": 7.6875, + "rewards/rejected": -5.2734375, + "step": 4696 + }, + { + "epoch": 0.9319906741405823, + "grad_norm": 38.37845710268401, + "learning_rate": 1.1271152740691278e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.734375, + "logps/chosen": -1282.0, + "logps/rejected": -740.0, + "loss": 0.3645, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.15625, + "rewards/margins": 9.21875, + "rewards/rejected": -6.0546875, + "step": 4697 + }, + { + "epoch": 0.9321890966813831, + "grad_norm": 29.051031167922297, + "learning_rate": 1.126380779842846e-07, + "logits/chosen": 3.78125, + 
"logits/rejected": 4.0859375, + "logps/chosen": -811.5, + "logps/rejected": -736.5, + "loss": 0.467, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.392578125, + "rewards/margins": 7.5234375, + "rewards/rejected": -6.12890625, + "step": 4698 + }, + { + "epoch": 0.9323875192221837, + "grad_norm": 27.379837637774507, + "learning_rate": 1.1256483835640147e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 3.85546875, + "logps/chosen": -869.0, + "logps/rejected": -743.0, + "loss": 0.4213, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.6748046875, + "rewards/margins": 9.57421875, + "rewards/rejected": -7.90234375, + "step": 4699 + }, + { + "epoch": 0.9325859417629843, + "grad_norm": 36.75629319581784, + "learning_rate": 1.1249180855839509e-07, + "logits/chosen": 3.5078125, + "logits/rejected": 3.390625, + "logps/chosen": -875.0, + "logps/rejected": -604.0, + "loss": 0.3133, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.982421875, + "rewards/margins": 8.953125, + "rewards/rejected": -5.9765625, + "step": 4700 + }, + { + "epoch": 0.9327843643037849, + "grad_norm": 36.45315427730646, + "learning_rate": 1.1241898862529655e-07, + "logits/chosen": 4.4453125, + "logits/rejected": 4.7265625, + "logps/chosen": -1369.0, + "logps/rejected": -1009.0, + "loss": 0.359, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.50390625, + "rewards/margins": 11.1875, + "rewards/rejected": -7.671875, + "step": 4701 + }, + { + "epoch": 0.9329827868445856, + "grad_norm": 28.908620901295244, + "learning_rate": 1.1234637859203637e-07, + "logits/chosen": 4.359375, + "logits/rejected": 4.14453125, + "logps/chosen": -742.0, + "logps/rejected": -744.0, + "loss": 0.487, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.046875, + "rewards/margins": 6.60546875, + "rewards/rejected": -4.5595703125, + "step": 4702 + }, + { + "epoch": 0.9331812093853862, + "grad_norm": 41.579741232756476, + "learning_rate": 1.1227397849344414e-07, + "logits/chosen": 4.05078125, + 
"logits/rejected": 3.7265625, + "logps/chosen": -961.0, + "logps/rejected": -630.0, + "loss": 0.3498, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.1875, + "rewards/margins": 9.7265625, + "rewards/rejected": -7.5234375, + "step": 4703 + }, + { + "epoch": 0.9333796319261868, + "grad_norm": 27.655816815194516, + "learning_rate": 1.1220178836424902e-07, + "logits/chosen": 3.73046875, + "logits/rejected": 3.5546875, + "logps/chosen": -820.75, + "logps/rejected": -552.5, + "loss": 0.5562, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.25927734375, + "rewards/margins": 5.11328125, + "rewards/rejected": -3.84375, + "step": 4704 + }, + { + "epoch": 0.9335780544669875, + "grad_norm": 26.89738932144832, + "learning_rate": 1.1212980823907929e-07, + "logits/chosen": 3.8515625, + "logits/rejected": 3.984375, + "logps/chosen": -891.0, + "logps/rejected": -764.0, + "loss": 0.433, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.43359375, + "rewards/margins": 6.828125, + "rewards/rejected": -4.390625, + "step": 4705 + }, + { + "epoch": 0.9337764770077881, + "grad_norm": 48.344774211665836, + "learning_rate": 1.1205803815246249e-07, + "logits/chosen": 3.7109375, + "logits/rejected": 3.78515625, + "logps/chosen": -861.0, + "logps/rejected": -810.0, + "loss": 0.4525, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8828125, + "rewards/margins": 7.578125, + "rewards/rejected": -5.703125, + "step": 4706 + }, + { + "epoch": 0.9339748995485887, + "grad_norm": 32.715670680633096, + "learning_rate": 1.1198647813882561e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.234375, + "logps/chosen": -1325.0, + "logps/rejected": -977.0, + "loss": 0.446, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.73046875, + "rewards/margins": 8.71875, + "rewards/rejected": -5.9921875, + "step": 4707 + }, + { + "epoch": 0.9341733220893893, + "grad_norm": 36.460939897098775, + "learning_rate": 1.119151282324945e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 
4.1796875, + "logps/chosen": -810.5, + "logps/rejected": -661.0, + "loss": 0.5216, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.462890625, + "rewards/margins": 5.57421875, + "rewards/rejected": -3.108642578125, + "step": 4708 + }, + { + "epoch": 0.93437174463019, + "grad_norm": 42.54224702619031, + "learning_rate": 1.1184398846769459e-07, + "logits/chosen": 4.04296875, + "logits/rejected": 4.05078125, + "logps/chosen": -1396.0, + "logps/rejected": -937.0, + "loss": 0.2879, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.306640625, + "rewards/margins": 9.265625, + "rewards/rejected": -5.953125, + "step": 4709 + }, + { + "epoch": 0.9345701671709906, + "grad_norm": 35.517384650002825, + "learning_rate": 1.1177305887855037e-07, + "logits/chosen": 3.78515625, + "logits/rejected": 3.86328125, + "logps/chosen": -831.0, + "logps/rejected": -995.0, + "loss": 0.4552, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9375, + "rewards/margins": 7.5859375, + "rewards/rejected": -5.6484375, + "step": 4710 + }, + { + "epoch": 0.9347685897117912, + "grad_norm": 37.59162379794367, + "learning_rate": 1.1170233949908542e-07, + "logits/chosen": 4.11328125, + "logits/rejected": 4.27734375, + "logps/chosen": -602.5, + "logps/rejected": -664.0, + "loss": 0.6014, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.5234375, + "rewards/margins": 4.96875, + "rewards/rejected": -3.44384765625, + "step": 4711 + }, + { + "epoch": 0.9349670122525919, + "grad_norm": 33.02152404681411, + "learning_rate": 1.1163183036322264e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 3.734375, + "logps/chosen": -948.0, + "logps/rejected": -841.0, + "loss": 0.3544, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.173828125, + "rewards/margins": 8.8984375, + "rewards/rejected": -5.71875, + "step": 4712 + }, + { + "epoch": 0.9351654347933925, + "grad_norm": 31.912268478612557, + "learning_rate": 1.1156153150478397e-07, + "logits/chosen": 3.890625, + "logits/rejected": 3.84765625, + 
"logps/chosen": -1107.0, + "logps/rejected": -1667.0, + "loss": 0.46, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.46875, + "rewards/margins": 12.0234375, + "rewards/rejected": -9.5625, + "step": 4713 + }, + { + "epoch": 0.9353638573341931, + "grad_norm": 30.58783755518533, + "learning_rate": 1.1149144295749052e-07, + "logits/chosen": 4.05078125, + "logits/rejected": 3.609375, + "logps/chosen": -1111.0, + "logps/rejected": -816.0, + "loss": 0.5213, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.94140625, + "rewards/margins": 8.20703125, + "rewards/rejected": -5.28125, + "step": 4714 + }, + { + "epoch": 0.9355622798749939, + "grad_norm": 26.073197424839677, + "learning_rate": 1.1142156475496253e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.3046875, + "logps/chosen": -928.0, + "logps/rejected": -828.0, + "loss": 0.4281, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.38671875, + "rewards/margins": 7.7734375, + "rewards/rejected": -5.3828125, + "step": 4715 + }, + { + "epoch": 0.9357607024157945, + "grad_norm": 28.429312726587554, + "learning_rate": 1.1135189693071936e-07, + "logits/chosen": 4.4375, + "logits/rejected": 4.6875, + "logps/chosen": -706.0, + "logps/rejected": -855.5, + "loss": 0.3481, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.146484375, + "rewards/margins": 7.640625, + "rewards/rejected": -5.49609375, + "step": 4716 + }, + { + "epoch": 0.9359591249565951, + "grad_norm": 32.93877322828673, + "learning_rate": 1.1128243951817936e-07, + "logits/chosen": 3.8359375, + "logits/rejected": 3.60546875, + "logps/chosen": -1053.0, + "logps/rejected": -1692.5, + "loss": 0.3676, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.07421875, + "rewards/margins": 10.1796875, + "rewards/rejected": -7.10546875, + "step": 4717 + }, + { + "epoch": 0.9361575474973957, + "grad_norm": 27.952038688276705, + "learning_rate": 1.1121319255066014e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.09375, + "logps/chosen": -831.0, 
+ "logps/rejected": -1030.0, + "loss": 0.4547, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.091796875, + "rewards/margins": 9.5234375, + "rewards/rejected": -7.44140625, + "step": 4718 + }, + { + "epoch": 0.9363559700381964, + "grad_norm": 34.484236426964394, + "learning_rate": 1.1114415606137814e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 3.86328125, + "logps/chosen": -871.0, + "logps/rejected": -636.5, + "loss": 0.4885, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.345703125, + "rewards/margins": 6.1015625, + "rewards/rejected": -4.7578125, + "step": 4719 + }, + { + "epoch": 0.936554392578997, + "grad_norm": 18.94440720491457, + "learning_rate": 1.1107533008344893e-07, + "logits/chosen": 4.44140625, + "logits/rejected": 4.3984375, + "logps/chosen": -1155.0, + "logps/rejected": -691.5, + "loss": 0.4403, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.787109375, + "rewards/margins": 6.9765625, + "rewards/rejected": -4.1953125, + "step": 4720 + }, + { + "epoch": 0.9367528151197976, + "grad_norm": 28.628484252489724, + "learning_rate": 1.1100671464988713e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 4.28125, + "logps/chosen": -1097.0, + "logps/rejected": -738.0, + "loss": 0.3567, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.572265625, + "rewards/margins": 8.5, + "rewards/rejected": -5.9296875, + "step": 4721 + }, + { + "epoch": 0.9369512376605983, + "grad_norm": 24.311440085731007, + "learning_rate": 1.1093830979360638e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.3359375, + "logps/chosen": -745.0, + "logps/rejected": -674.0, + "loss": 0.4323, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.52734375, + "rewards/margins": 7.2890625, + "rewards/rejected": -4.7578125, + "step": 4722 + }, + { + "epoch": 0.9371496602013989, + "grad_norm": 23.839110300533175, + "learning_rate": 1.1087011554741926e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.1328125, + "logps/chosen": -1301.0, + 
"logps/rejected": -1100.0, + "loss": 0.2734, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.98046875, + "rewards/margins": 11.625, + "rewards/rejected": -8.671875, + "step": 4723 + }, + { + "epoch": 0.9373480827421995, + "grad_norm": 27.145099906753774, + "learning_rate": 1.1080213194403728e-07, + "logits/chosen": 3.7421875, + "logits/rejected": 4.19921875, + "logps/chosen": -905.5, + "logps/rejected": -918.0, + "loss": 0.4378, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.1484375, + "rewards/margins": 8.46875, + "rewards/rejected": -5.3203125, + "step": 4724 + }, + { + "epoch": 0.9375465052830001, + "grad_norm": 35.234172562398626, + "learning_rate": 1.1073435901607104e-07, + "logits/chosen": 3.90625, + "logits/rejected": 4.16015625, + "logps/chosen": -846.0, + "logps/rejected": -860.0, + "loss": 0.4482, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0, + "rewards/margins": 7.546875, + "rewards/rejected": -5.52734375, + "step": 4725 + }, + { + "epoch": 0.9377449278238008, + "grad_norm": 25.19544982453978, + "learning_rate": 1.1066679679602998e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.0390625, + "logps/chosen": -1090.0, + "logps/rejected": -838.0, + "loss": 0.4503, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.76953125, + "rewards/margins": 8.4453125, + "rewards/rejected": -5.6796875, + "step": 4726 + }, + { + "epoch": 0.9379433503646014, + "grad_norm": 26.984871248578823, + "learning_rate": 1.1059944531632257e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.76953125, + "logps/chosen": -1358.0, + "logps/rejected": -764.0, + "loss": 0.2962, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.03125, + "rewards/margins": 9.3671875, + "rewards/rejected": -6.3359375, + "step": 4727 + }, + { + "epoch": 0.938141772905402, + "grad_norm": 23.26849557316423, + "learning_rate": 1.1053230460925591e-07, + "logits/chosen": 4.796875, + "logits/rejected": 4.4609375, + "logps/chosen": -1118.0, + "logps/rejected": -755.5, + 
"loss": 0.3656, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.728515625, + "rewards/margins": 8.9765625, + "rewards/rejected": -5.265625, + "step": 4728 + }, + { + "epoch": 0.9383401954462027, + "grad_norm": 26.009380287065795, + "learning_rate": 1.1046537470703646e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 4.03125, + "logps/chosen": -855.0, + "logps/rejected": -806.0, + "loss": 0.1727, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.10546875, + "rewards/margins": 14.546875, + "rewards/rejected": -10.44140625, + "step": 4729 + }, + { + "epoch": 0.9385386179870033, + "grad_norm": 33.11081633918848, + "learning_rate": 1.1039865564176915e-07, + "logits/chosen": 3.80078125, + "logits/rejected": 4.125, + "logps/chosen": -1313.0, + "logps/rejected": -1426.0, + "loss": 0.2887, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.71484375, + "rewards/margins": 11.8125, + "rewards/rejected": -8.125, + "step": 4730 + }, + { + "epoch": 0.9387370405278039, + "grad_norm": 32.80867399179877, + "learning_rate": 1.1033214744545794e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.4296875, + "logps/chosen": -721.0, + "logps/rejected": -537.5, + "loss": 0.3224, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.51171875, + "rewards/margins": 7.5390625, + "rewards/rejected": -5.03125, + "step": 4731 + }, + { + "epoch": 0.9389354630686046, + "grad_norm": 31.783561040999288, + "learning_rate": 1.1026585015000575e-07, + "logits/chosen": 3.6328125, + "logits/rejected": 3.76171875, + "logps/chosen": -939.0, + "logps/rejected": -596.5, + "loss": 0.4219, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.173828125, + "rewards/margins": 7.078125, + "rewards/rejected": -4.90234375, + "step": 4732 + }, + { + "epoch": 0.9391338856094053, + "grad_norm": 31.235428683639295, + "learning_rate": 1.1019976378721399e-07, + "logits/chosen": 4.125, + "logits/rejected": 3.89453125, + "logps/chosen": -1100.0, + "logps/rejected": -614.5, + "loss": 0.4246, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 2.3662109375, + "rewards/margins": 7.5, + "rewards/rejected": -5.12890625, + "step": 4733 + }, + { + "epoch": 0.9393323081502059, + "grad_norm": 28.723697520001647, + "learning_rate": 1.1013388838878332e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 3.79296875, + "logps/chosen": -896.0, + "logps/rejected": -669.0, + "loss": 0.348, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.28515625, + "rewards/margins": 7.90625, + "rewards/rejected": -4.640625, + "step": 4734 + }, + { + "epoch": 0.9395307306910065, + "grad_norm": 34.287584056246835, + "learning_rate": 1.1006822398631296e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.09765625, + "logps/chosen": -848.0, + "logps/rejected": -519.5, + "loss": 0.4469, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.599609375, + "rewards/margins": 6.359375, + "rewards/rejected": -3.755859375, + "step": 4735 + }, + { + "epoch": 0.9397291532318072, + "grad_norm": 32.63320988848215, + "learning_rate": 1.100027706113009e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.25390625, + "logps/chosen": -1009.0, + "logps/rejected": -697.0, + "loss": 0.4069, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.16796875, + "rewards/margins": 7.640625, + "rewards/rejected": -5.484375, + "step": 4736 + }, + { + "epoch": 0.9399275757726078, + "grad_norm": 32.05714069515938, + "learning_rate": 1.0993752829514398e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.1640625, + "logps/chosen": -1091.0, + "logps/rejected": -1044.0, + "loss": 0.5198, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.8115234375, + "rewards/margins": 7.91796875, + "rewards/rejected": -6.111328125, + "step": 4737 + }, + { + "epoch": 0.9401259983134084, + "grad_norm": 34.913213027371846, + "learning_rate": 1.0987249706913778e-07, + "logits/chosen": 4.33203125, + "logits/rejected": 4.02734375, + "logps/chosen": -1173.0, + "logps/rejected": -1108.0, + "loss": 0.277, + 
"rewards/accuracies": 0.96875, + "rewards/chosen": 3.71875, + "rewards/margins": 10.6875, + "rewards/rejected": -6.953125, + "step": 4738 + }, + { + "epoch": 0.9403244208542091, + "grad_norm": 33.907654939934645, + "learning_rate": 1.0980767696447667e-07, + "logits/chosen": 3.71875, + "logits/rejected": 3.796875, + "logps/chosen": -976.0, + "logps/rejected": -940.0, + "loss": 0.5135, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.70654296875, + "rewards/margins": 8.3203125, + "rewards/rejected": -6.59375, + "step": 4739 + }, + { + "epoch": 0.9405228433950097, + "grad_norm": 30.816749817700533, + "learning_rate": 1.097430680122536e-07, + "logits/chosen": 4.15234375, + "logits/rejected": 4.14453125, + "logps/chosen": -1242.0, + "logps/rejected": -775.0, + "loss": 0.5292, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.83984375, + "rewards/margins": 6.9296875, + "rewards/rejected": -4.09765625, + "step": 4740 + }, + { + "epoch": 0.9407212659358103, + "grad_norm": 31.092744498345013, + "learning_rate": 1.0967867024346044e-07, + "logits/chosen": 4.13671875, + "logits/rejected": 3.6796875, + "logps/chosen": -925.0, + "logps/rejected": -546.5, + "loss": 0.3248, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1796875, + "rewards/margins": 7.640625, + "rewards/rejected": -5.4375, + "step": 4741 + }, + { + "epoch": 0.9409196884766109, + "grad_norm": 45.659265417697924, + "learning_rate": 1.096144836889876e-07, + "logits/chosen": 3.7578125, + "logits/rejected": 3.78515625, + "logps/chosen": -832.0, + "logps/rejected": -778.0, + "loss": 0.4583, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.01025390625, + "rewards/margins": 7.796875, + "rewards/rejected": -5.76953125, + "step": 4742 + }, + { + "epoch": 0.9411181110174116, + "grad_norm": 29.235370167332942, + "learning_rate": 1.0955050837962426e-07, + "logits/chosen": 4.40234375, + "logits/rejected": 4.37890625, + "logps/chosen": -914.0, + "logps/rejected": -511.0, + "loss": 0.4848, + 
"rewards/accuracies": 0.78125, + "rewards/chosen": 3.220703125, + "rewards/margins": 7.140625, + "rewards/rejected": -3.93359375, + "step": 4743 + }, + { + "epoch": 0.9413165335582122, + "grad_norm": 35.81759141930152, + "learning_rate": 1.0948674434605823e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 3.9375, + "logps/chosen": -1072.0, + "logps/rejected": -792.0, + "loss": 0.3558, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4296875, + "rewards/margins": 8.5, + "rewards/rejected": -5.07421875, + "step": 4744 + }, + { + "epoch": 0.9415149560990128, + "grad_norm": 24.50136566557793, + "learning_rate": 1.0942319161887595e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 3.81640625, + "logps/chosen": -1059.0, + "logps/rejected": -675.0, + "loss": 0.3313, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3681640625, + "rewards/margins": 9.4765625, + "rewards/rejected": -7.0859375, + "step": 4745 + }, + { + "epoch": 0.9417133786398135, + "grad_norm": 28.54697286686941, + "learning_rate": 1.0935985022856256e-07, + "logits/chosen": 3.84375, + "logits/rejected": 3.69140625, + "logps/chosen": -947.0, + "logps/rejected": -617.0, + "loss": 0.3378, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.705078125, + "rewards/margins": 8.5234375, + "rewards/rejected": -5.8203125, + "step": 4746 + }, + { + "epoch": 0.9419118011806141, + "grad_norm": 35.97469971874578, + "learning_rate": 1.0929672020550183e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 3.69140625, + "logps/chosen": -727.0, + "logps/rejected": -1166.0, + "loss": 0.4255, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.357421875, + "rewards/margins": 8.859375, + "rewards/rejected": -6.5078125, + "step": 4747 + }, + { + "epoch": 0.9421102237214147, + "grad_norm": 37.01320258067931, + "learning_rate": 1.0923380157997608e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.171875, + "logps/chosen": -1077.0, + "logps/rejected": -1213.0, + "loss": 0.4686, + 
"rewards/accuracies": 0.8125, + "rewards/chosen": 1.318359375, + "rewards/margins": 8.2890625, + "rewards/rejected": -6.96484375, + "step": 4748 + }, + { + "epoch": 0.9423086462622153, + "grad_norm": 39.92437064719488, + "learning_rate": 1.0917109438216626e-07, + "logits/chosen": 4.390625, + "logits/rejected": 4.1171875, + "logps/chosen": -1166.0, + "logps/rejected": -619.5, + "loss": 0.4266, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.24951171875, + "rewards/margins": 8.1875, + "rewards/rejected": -4.9375, + "step": 4749 + }, + { + "epoch": 0.942507068803016, + "grad_norm": 31.242104735202865, + "learning_rate": 1.0910859864215192e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.98828125, + "logps/chosen": -1014.0, + "logps/rejected": -789.0, + "loss": 0.492, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6455078125, + "rewards/margins": 6.890625, + "rewards/rejected": -5.2421875, + "step": 4750 + }, + { + "epoch": 0.9427054913438166, + "grad_norm": 24.737546514853413, + "learning_rate": 1.0904631438991106e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.4140625, + "logps/chosen": -649.0, + "logps/rejected": -756.0, + "loss": 0.3421, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.68359375, + "rewards/margins": 9.125, + "rewards/rejected": -6.4453125, + "step": 4751 + }, + { + "epoch": 0.9429039138846173, + "grad_norm": 41.16503741195452, + "learning_rate": 1.089842416553205e-07, + "logits/chosen": 4.046875, + "logits/rejected": 3.9453125, + "logps/chosen": -1304.0, + "logps/rejected": -977.0, + "loss": 0.3746, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8564453125, + "rewards/margins": 10.7734375, + "rewards/rejected": -7.9375, + "step": 4752 + }, + { + "epoch": 0.943102336425418, + "grad_norm": 22.372259581100245, + "learning_rate": 1.0892238046815527e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.02734375, + "logps/chosen": -1051.0, + "logps/rejected": -837.0, + "loss": 0.2683, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 2.79296875, + "rewards/margins": 11.046875, + "rewards/rejected": -8.25, + "step": 4753 + }, + { + "epoch": 0.9433007589662186, + "grad_norm": 35.99022825695638, + "learning_rate": 1.088607308580892e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.01171875, + "logps/chosen": -819.0, + "logps/rejected": -777.5, + "loss": 0.4781, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.123046875, + "rewards/margins": 6.64453125, + "rewards/rejected": -4.51171875, + "step": 4754 + }, + { + "epoch": 0.9434991815070192, + "grad_norm": 33.516537567555396, + "learning_rate": 1.0879929285469445e-07, + "logits/chosen": 4.640625, + "logits/rejected": 4.4140625, + "logps/chosen": -1125.0, + "logps/rejected": -643.0, + "loss": 0.3666, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.53515625, + "rewards/margins": 8.640625, + "rewards/rejected": -6.09375, + "step": 4755 + }, + { + "epoch": 0.9436976040478199, + "grad_norm": 40.36628865507223, + "learning_rate": 1.0873806648744165e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 3.828125, + "logps/chosen": -1054.0, + "logps/rejected": -1625.0, + "loss": 0.3767, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.96484375, + "rewards/margins": 10.984375, + "rewards/rejected": -8.015625, + "step": 4756 + }, + { + "epoch": 0.9438960265886205, + "grad_norm": 28.493865146325938, + "learning_rate": 1.0867705178570022e-07, + "logits/chosen": 3.51953125, + "logits/rejected": 3.67578125, + "logps/chosen": -920.5, + "logps/rejected": -1465.5, + "loss": 0.4995, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.555419921875, + "rewards/margins": 9.46875, + "rewards/rejected": -7.9140625, + "step": 4757 + }, + { + "epoch": 0.9440944491294211, + "grad_norm": 30.41000626836164, + "learning_rate": 1.0861624877873766e-07, + "logits/chosen": 4.36328125, + "logits/rejected": 4.35546875, + "logps/chosen": -1277.0, + "logps/rejected": -923.0, + "loss": 0.4284, + 
"rewards/accuracies": 0.84375, + "rewards/chosen": 3.171875, + "rewards/margins": 9.5390625, + "rewards/rejected": -6.37890625, + "step": 4758 + }, + { + "epoch": 0.9442928716702217, + "grad_norm": 28.26813921985753, + "learning_rate": 1.0855565749572015e-07, + "logits/chosen": 4.23046875, + "logits/rejected": 4.24609375, + "logps/chosen": -974.0, + "logps/rejected": -736.5, + "loss": 0.3983, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.796875, + "rewards/margins": 10.1953125, + "rewards/rejected": -6.390625, + "step": 4759 + }, + { + "epoch": 0.9444912942110224, + "grad_norm": 31.06175076427107, + "learning_rate": 1.0849527796571225e-07, + "logits/chosen": 4.28515625, + "logits/rejected": 4.29296875, + "logps/chosen": -899.0, + "logps/rejected": -640.5, + "loss": 0.4669, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.361328125, + "rewards/margins": 7.06640625, + "rewards/rejected": -4.7109375, + "step": 4760 + }, + { + "epoch": 0.944689716751823, + "grad_norm": 33.40670189618193, + "learning_rate": 1.0843511021767689e-07, + "logits/chosen": 4.4765625, + "logits/rejected": 4.3984375, + "logps/chosen": -794.0, + "logps/rejected": -567.0, + "loss": 0.418, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.578125, + "rewards/margins": 7.9921875, + "rewards/rejected": -5.412109375, + "step": 4761 + }, + { + "epoch": 0.9448881392926236, + "grad_norm": 47.47817833676104, + "learning_rate": 1.0837515428047555e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 4.06640625, + "logps/chosen": -954.0, + "logps/rejected": -829.0, + "loss": 0.4518, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.482421875, + "rewards/margins": 10.078125, + "rewards/rejected": -7.6015625, + "step": 4762 + }, + { + "epoch": 0.9450865618334243, + "grad_norm": 28.986858163514835, + "learning_rate": 1.08315410182868e-07, + "logits/chosen": 4.2109375, + "logits/rejected": 4.14453125, + "logps/chosen": -1371.0, + "logps/rejected": -734.0, + "loss": 0.4537, + 
"rewards/accuracies": 0.75, + "rewards/chosen": 1.48388671875, + "rewards/margins": 7.18359375, + "rewards/rejected": -5.69921875, + "step": 4763 + }, + { + "epoch": 0.9452849843742249, + "grad_norm": 31.060913839815935, + "learning_rate": 1.0825587795351246e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.98828125, + "logps/chosen": -822.5, + "logps/rejected": -602.0, + "loss": 0.4086, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.09716796875, + "rewards/margins": 8.01171875, + "rewards/rejected": -5.921875, + "step": 4764 + }, + { + "epoch": 0.9454834069150255, + "grad_norm": 26.841550907847047, + "learning_rate": 1.0819655762096538e-07, + "logits/chosen": 3.19921875, + "logits/rejected": 3.2109375, + "logps/chosen": -911.0, + "logps/rejected": -846.0, + "loss": 0.5066, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.314453125, + "rewards/margins": 16.46875, + "rewards/rejected": -14.109375, + "step": 4765 + }, + { + "epoch": 0.9456818294558261, + "grad_norm": 30.26384551189497, + "learning_rate": 1.081374492136818e-07, + "logits/chosen": 4.23828125, + "logits/rejected": 4.1171875, + "logps/chosen": -1286.0, + "logps/rejected": -978.0, + "loss": 0.3441, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.185546875, + "rewards/margins": 10.0625, + "rewards/rejected": -6.8984375, + "step": 4766 + }, + { + "epoch": 0.9458802519966268, + "grad_norm": 27.86221938942042, + "learning_rate": 1.0807855276001487e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 3.640625, + "logps/chosen": -919.0, + "logps/rejected": -640.5, + "loss": 0.4704, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3203125, + "rewards/margins": 6.8828125, + "rewards/rejected": -4.56640625, + "step": 4767 + }, + { + "epoch": 0.9460786745374274, + "grad_norm": 34.51132649868578, + "learning_rate": 1.080198682882162e-07, + "logits/chosen": 3.6953125, + "logits/rejected": 3.703125, + "logps/chosen": -1010.0, + "logps/rejected": -818.0, + "loss": 0.4012, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 2.166015625, + "rewards/margins": 8.078125, + "rewards/rejected": -5.90625, + "step": 4768 + }, + { + "epoch": 0.946277097078228, + "grad_norm": 35.255228520287226, + "learning_rate": 1.0796139582643573e-07, + "logits/chosen": 4.4609375, + "logits/rejected": 4.4765625, + "logps/chosen": -1002.5, + "logps/rejected": -879.0, + "loss": 0.5425, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.666015625, + "rewards/margins": 7.78515625, + "rewards/rejected": -5.12109375, + "step": 4769 + }, + { + "epoch": 0.9464755196190288, + "grad_norm": 30.56973642848757, + "learning_rate": 1.0790313540272156e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 3.97265625, + "logps/chosen": -1203.0, + "logps/rejected": -753.0, + "loss": 0.3367, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.072265625, + "rewards/margins": 8.1953125, + "rewards/rejected": -6.125, + "step": 4770 + }, + { + "epoch": 0.9466739421598294, + "grad_norm": 34.71427516053364, + "learning_rate": 1.0784508704502029e-07, + "logits/chosen": 4.390625, + "logits/rejected": 4.15625, + "logps/chosen": -1202.0, + "logps/rejected": -723.5, + "loss": 0.4242, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.22265625, + "rewards/margins": 7.12890625, + "rewards/rejected": -4.90625, + "step": 4771 + }, + { + "epoch": 0.94687236470063, + "grad_norm": 32.63545032787796, + "learning_rate": 1.0778725078117667e-07, + "logits/chosen": 3.65625, + "logits/rejected": 3.71484375, + "logps/chosen": -1093.0, + "logps/rejected": -942.0, + "loss": 0.3703, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.080078125, + "rewards/margins": 8.2890625, + "rewards/rejected": -5.1875, + "step": 4772 + }, + { + "epoch": 0.9470707872414307, + "grad_norm": 30.369390001264318, + "learning_rate": 1.0772962663893367e-07, + "logits/chosen": 3.54296875, + "logits/rejected": 3.765625, + "logps/chosen": -963.0, + "logps/rejected": -589.0, + "loss": 0.4694, + "rewards/accuracies": 
0.8125, + "rewards/chosen": 1.76171875, + "rewards/margins": 6.21484375, + "rewards/rejected": -4.462890625, + "step": 4773 + }, + { + "epoch": 0.9472692097822313, + "grad_norm": 34.515129025080185, + "learning_rate": 1.0767221464593263e-07, + "logits/chosen": 4.7265625, + "logits/rejected": 4.625, + "logps/chosen": -1083.0, + "logps/rejected": -1819.0, + "loss": 0.3968, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.30859375, + "rewards/margins": 11.9140625, + "rewards/rejected": -8.609375, + "step": 4774 + }, + { + "epoch": 0.9474676323230319, + "grad_norm": 25.770362220651297, + "learning_rate": 1.0761501482971304e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.08203125, + "logps/chosen": -1035.0, + "logps/rejected": -685.0, + "loss": 0.4238, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.736328125, + "rewards/margins": 7.41796875, + "rewards/rejected": -5.67578125, + "step": 4775 + }, + { + "epoch": 0.9476660548638325, + "grad_norm": 39.41375447394975, + "learning_rate": 1.0755802721771254e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.51953125, + "logps/chosen": -1074.0, + "logps/rejected": -721.0, + "loss": 0.3612, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.88671875, + "rewards/margins": 9.4296875, + "rewards/rejected": -5.53515625, + "step": 4776 + }, + { + "epoch": 0.9478644774046332, + "grad_norm": 32.80908938265565, + "learning_rate": 1.0750125183726727e-07, + "logits/chosen": 3.796875, + "logits/rejected": 3.83203125, + "logps/chosen": -960.0, + "logps/rejected": -774.5, + "loss": 0.463, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.1279296875, + "rewards/margins": 8.4921875, + "rewards/rejected": -6.359375, + "step": 4777 + }, + { + "epoch": 0.9480628999454338, + "grad_norm": 29.94719732477378, + "learning_rate": 1.0744468871561125e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.984375, + "logps/chosen": -1029.0, + "logps/rejected": -722.5, + "loss": 0.3535, + "rewards/accuracies": 
0.90625, + "rewards/chosen": 2.55859375, + "rewards/margins": 8.2421875, + "rewards/rejected": -5.67578125, + "step": 4778 + }, + { + "epoch": 0.9482613224862344, + "grad_norm": 33.38393428425856, + "learning_rate": 1.0738833787987677e-07, + "logits/chosen": 4.37109375, + "logits/rejected": 4.3046875, + "logps/chosen": -1921.0, + "logps/rejected": -758.0, + "loss": 0.4308, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.859375, + "rewards/margins": 6.81640625, + "rewards/rejected": -5.94921875, + "step": 4779 + }, + { + "epoch": 0.9484597450270351, + "grad_norm": 24.149979735266236, + "learning_rate": 1.073321993570944e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 3.828125, + "logps/chosen": -1458.0, + "logps/rejected": -794.0, + "loss": 0.3052, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.23828125, + "rewards/margins": 8.96875, + "rewards/rejected": -5.7421875, + "step": 4780 + }, + { + "epoch": 0.9486581675678357, + "grad_norm": 28.65645202785235, + "learning_rate": 1.0727627317419283e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.84765625, + "logps/chosen": -960.0, + "logps/rejected": -720.0, + "loss": 0.2945, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.71875, + "rewards/margins": 8.984375, + "rewards/rejected": -6.265625, + "step": 4781 + }, + { + "epoch": 0.9488565901086363, + "grad_norm": 19.48959785010418, + "learning_rate": 1.072205593579987e-07, + "logits/chosen": 4.65625, + "logits/rejected": 4.5078125, + "logps/chosen": -977.0, + "logps/rejected": -679.0, + "loss": 0.4155, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.072265625, + "rewards/margins": 7.96875, + "rewards/rejected": -4.9140625, + "step": 4782 + }, + { + "epoch": 0.9490550126494369, + "grad_norm": 38.16317415488005, + "learning_rate": 1.0716505793523708e-07, + "logits/chosen": 4.30078125, + "logits/rejected": 3.94140625, + "logps/chosen": -1274.0, + "logps/rejected": -649.0, + "loss": 0.3976, + "rewards/accuracies": 0.84375, + 
"rewards/chosen": 1.4453125, + "rewards/margins": 6.890625, + "rewards/rejected": -5.44140625, + "step": 4783 + }, + { + "epoch": 0.9492534351902376, + "grad_norm": 35.05397369742048, + "learning_rate": 1.0710976893253101e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 4.3125, + "logps/chosen": -827.0, + "logps/rejected": -677.0, + "loss": 0.5371, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.654296875, + "rewards/margins": 6.65625, + "rewards/rejected": -3.994140625, + "step": 4784 + }, + { + "epoch": 0.9494518577310382, + "grad_norm": 27.654768098750253, + "learning_rate": 1.070546923764015e-07, + "logits/chosen": 4.515625, + "logits/rejected": 4.421875, + "logps/chosen": -1526.0, + "logps/rejected": -790.0, + "loss": 0.4188, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0859375, + "rewards/margins": 8.8828125, + "rewards/rejected": -5.8046875, + "step": 4785 + }, + { + "epoch": 0.9496502802718388, + "grad_norm": 28.12418770648361, + "learning_rate": 1.0699982829326797e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.7734375, + "logps/chosen": -907.0, + "logps/rejected": -780.0, + "loss": 0.2526, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6875, + "rewards/margins": 8.4453125, + "rewards/rejected": -5.765625, + "step": 4786 + }, + { + "epoch": 0.9498487028126396, + "grad_norm": 29.612816358545867, + "learning_rate": 1.0694517670944766e-07, + "logits/chosen": 4.09375, + "logits/rejected": 4.046875, + "logps/chosen": -1248.0, + "logps/rejected": -894.0, + "loss": 0.2787, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6171875, + "rewards/margins": 9.09375, + "rewards/rejected": -6.4765625, + "step": 4787 + }, + { + "epoch": 0.9500471253534402, + "grad_norm": 26.7545903667229, + "learning_rate": 1.0689073765115598e-07, + "logits/chosen": 4.171875, + "logits/rejected": 3.92578125, + "logps/chosen": -1157.5, + "logps/rejected": -624.5, + "loss": 0.4318, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.279296875, 
+ "rewards/margins": 7.3359375, + "rewards/rejected": -4.0595703125, + "step": 4788 + }, + { + "epoch": 0.9502455478942408, + "grad_norm": 30.308171077399212, + "learning_rate": 1.068365111445064e-07, + "logits/chosen": 4.046875, + "logits/rejected": 4.33203125, + "logps/chosen": -768.0, + "logps/rejected": -932.0, + "loss": 0.4693, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.53125, + "rewards/margins": 6.65625, + "rewards/rejected": -5.12109375, + "step": 4789 + }, + { + "epoch": 0.9504439704350415, + "grad_norm": 23.302663882120317, + "learning_rate": 1.0678249721551033e-07, + "logits/chosen": 4.01171875, + "logits/rejected": 4.0390625, + "logps/chosen": -1201.0, + "logps/rejected": -907.0, + "loss": 0.3031, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.83984375, + "rewards/margins": 10.25, + "rewards/rejected": -6.3984375, + "step": 4790 + }, + { + "epoch": 0.9506423929758421, + "grad_norm": 37.98381892600821, + "learning_rate": 1.0672869589007738e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 4.13671875, + "logps/chosen": -1344.0, + "logps/rejected": -931.0, + "loss": 0.4621, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.587890625, + "rewards/margins": 7.953125, + "rewards/rejected": -5.3515625, + "step": 4791 + }, + { + "epoch": 0.9508408155166427, + "grad_norm": 28.872317022638214, + "learning_rate": 1.0667510719401508e-07, + "logits/chosen": 4.29296875, + "logits/rejected": 4.00390625, + "logps/chosen": -1331.0, + "logps/rejected": -1034.0, + "loss": 0.5081, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.501953125, + "rewards/margins": 8.703125, + "rewards/rejected": -5.1953125, + "step": 4792 + }, + { + "epoch": 0.9510392380574433, + "grad_norm": 29.19763608589632, + "learning_rate": 1.0662173115302899e-07, + "logits/chosen": 3.64453125, + "logits/rejected": 3.8203125, + "logps/chosen": -785.0, + "logps/rejected": -590.0, + "loss": 0.4756, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.06103515625, + 
"rewards/margins": 7.0625, + "rewards/rejected": -4.98046875, + "step": 4793 + }, + { + "epoch": 0.951237660598244, + "grad_norm": 21.81506238676984, + "learning_rate": 1.0656856779272256e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.9765625, + "logps/chosen": -1005.0, + "logps/rejected": -735.0, + "loss": 0.343, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.69921875, + "rewards/margins": 9.4453125, + "rewards/rejected": -6.7421875, + "step": 4794 + }, + { + "epoch": 0.9514360831390446, + "grad_norm": 36.43091271463368, + "learning_rate": 1.0651561713859734e-07, + "logits/chosen": 4.640625, + "logits/rejected": 4.30859375, + "logps/chosen": -1289.0, + "logps/rejected": -961.0, + "loss": 0.4687, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.61328125, + "rewards/margins": 7.796875, + "rewards/rejected": -5.171875, + "step": 4795 + }, + { + "epoch": 0.9516345056798452, + "grad_norm": 28.777642781268362, + "learning_rate": 1.0646287921605288e-07, + "logits/chosen": 4.34375, + "logits/rejected": 4.22265625, + "logps/chosen": -867.5, + "logps/rejected": -685.0, + "loss": 0.3858, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.484375, + "rewards/margins": 6.8046875, + "rewards/rejected": -4.33203125, + "step": 4796 + }, + { + "epoch": 0.9518329282206459, + "grad_norm": 37.306252972753185, + "learning_rate": 1.0641035405038659e-07, + "logits/chosen": 3.703125, + "logits/rejected": 3.796875, + "logps/chosen": -1053.0, + "logps/rejected": -812.5, + "loss": 0.4126, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.01171875, + "rewards/margins": 8.1953125, + "rewards/rejected": -6.171875, + "step": 4797 + }, + { + "epoch": 0.9520313507614465, + "grad_norm": 30.951922802222427, + "learning_rate": 1.0635804166679385e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.04296875, + "logps/chosen": -1098.0, + "logps/rejected": -850.0, + "loss": 0.3408, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7109375, + "rewards/margins": 8.546875, + 
"rewards/rejected": -5.83203125, + "step": 4798 + }, + { + "epoch": 0.9522297733022471, + "grad_norm": 28.227946422015243, + "learning_rate": 1.06305942090368e-07, + "logits/chosen": 3.78515625, + "logits/rejected": 3.7421875, + "logps/chosen": -1126.0, + "logps/rejected": -929.0, + "loss": 0.2407, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.88671875, + "rewards/margins": 20.3515625, + "rewards/rejected": -17.49609375, + "step": 4799 + }, + { + "epoch": 0.9524281958430477, + "grad_norm": 31.49414755334552, + "learning_rate": 1.0625405534610032e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 4.08203125, + "logps/chosen": -1104.0, + "logps/rejected": -1111.0, + "loss": 0.3397, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2734375, + "rewards/margins": 18.6875, + "rewards/rejected": -16.3984375, + "step": 4800 + }, + { + "epoch": 0.9526266183838484, + "grad_norm": 22.626441311143054, + "learning_rate": 1.062023814588798e-07, + "logits/chosen": 4.5546875, + "logits/rejected": 4.32421875, + "logps/chosen": -1083.0, + "logps/rejected": -629.0, + "loss": 0.3997, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.640625, + "rewards/margins": 8.75, + "rewards/rejected": -6.09375, + "step": 4801 + }, + { + "epoch": 0.952825040924649, + "grad_norm": 29.64489339622017, + "learning_rate": 1.0615092045349365e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.26953125, + "logps/chosen": -848.5, + "logps/rejected": -703.0, + "loss": 0.53, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.57421875, + "rewards/margins": 6.62109375, + "rewards/rejected": -4.0390625, + "step": 4802 + }, + { + "epoch": 0.9530234634654496, + "grad_norm": 29.56519582781048, + "learning_rate": 1.0609967235462676e-07, + "logits/chosen": 3.8125, + "logits/rejected": 3.8203125, + "logps/chosen": -891.0, + "logps/rejected": -797.0, + "loss": 0.353, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6875, + "rewards/margins": 8.88671875, + "rewards/rejected": 
-6.1953125, + "step": 4803 + }, + { + "epoch": 0.9532218860062504, + "grad_norm": 29.61123506191941, + "learning_rate": 1.0604863718686188e-07, + "logits/chosen": 3.8828125, + "logits/rejected": 3.95703125, + "logps/chosen": -1340.0, + "logps/rejected": -803.0, + "loss": 0.3175, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.525390625, + "rewards/margins": 9.75, + "rewards/rejected": -6.23046875, + "step": 4804 + }, + { + "epoch": 0.953420308547051, + "grad_norm": 43.95339098072609, + "learning_rate": 1.0599781497467976e-07, + "logits/chosen": 3.875, + "logits/rejected": 3.6953125, + "logps/chosen": -813.0, + "logps/rejected": -559.0, + "loss": 0.4565, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.91015625, + "rewards/margins": 7.265625, + "rewards/rejected": -5.3515625, + "step": 4805 + }, + { + "epoch": 0.9536187310878516, + "grad_norm": 35.948314304214286, + "learning_rate": 1.0594720574245882e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.85546875, + "logps/chosen": -987.0, + "logps/rejected": -563.0, + "loss": 0.3065, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.22265625, + "rewards/margins": 8.7890625, + "rewards/rejected": -5.5703125, + "step": 4806 + }, + { + "epoch": 0.9538171536286523, + "grad_norm": 36.197398820194294, + "learning_rate": 1.0589680951447551e-07, + "logits/chosen": 3.703125, + "logits/rejected": 3.6484375, + "logps/chosen": -1081.0, + "logps/rejected": -733.0, + "loss": 0.3285, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7841796875, + "rewards/margins": 10.7734375, + "rewards/rejected": -8.0, + "step": 4807 + }, + { + "epoch": 0.9540155761694529, + "grad_norm": 30.28222492058419, + "learning_rate": 1.0584662631490388e-07, + "logits/chosen": 3.8359375, + "logits/rejected": 3.859375, + "logps/chosen": -914.0, + "logps/rejected": -770.0, + "loss": 0.4548, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.73828125, + "rewards/margins": 7.359375, + "rewards/rejected": -4.6328125, + "step": 
4808 + }, + { + "epoch": 0.9542139987102535, + "grad_norm": 33.29134298700417, + "learning_rate": 1.0579665616781603e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 4.01953125, + "logps/chosen": -1306.0, + "logps/rejected": -921.0, + "loss": 0.3894, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.770263671875, + "rewards/margins": 9.7109375, + "rewards/rejected": -6.94140625, + "step": 4809 + }, + { + "epoch": 0.9544124212510541, + "grad_norm": 28.29951438882862, + "learning_rate": 1.0574689909718168e-07, + "logits/chosen": 4.171875, + "logits/rejected": 4.03125, + "logps/chosen": -1006.0, + "logps/rejected": -613.0, + "loss": 0.3862, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.14453125, + "rewards/margins": 8.171875, + "rewards/rejected": -5.0234375, + "step": 4810 + }, + { + "epoch": 0.9546108437918548, + "grad_norm": 25.018277381045646, + "learning_rate": 1.0569735512686846e-07, + "logits/chosen": 3.76953125, + "logits/rejected": 3.68359375, + "logps/chosen": -817.0, + "logps/rejected": -567.5, + "loss": 0.3596, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.85546875, + "rewards/margins": 7.65625, + "rewards/rejected": -4.80078125, + "step": 4811 + }, + { + "epoch": 0.9548092663326554, + "grad_norm": 40.70687600036639, + "learning_rate": 1.0564802428064174e-07, + "logits/chosen": 3.5703125, + "logits/rejected": 3.8984375, + "logps/chosen": -850.0, + "logps/rejected": -759.0, + "loss": 0.4352, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.01953125, + "rewards/margins": 7.078125, + "rewards/rejected": -5.05859375, + "step": 4812 + }, + { + "epoch": 0.955007688873456, + "grad_norm": 20.553466369995103, + "learning_rate": 1.0559890658216453e-07, + "logits/chosen": 4.640625, + "logits/rejected": 4.2421875, + "logps/chosen": -1181.0, + "logps/rejected": -1787.0, + "loss": 0.2864, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.83203125, + "rewards/margins": 13.421875, + "rewards/rejected": -10.58203125, + "step": 4813 + 
}, + { + "epoch": 0.9552061114142567, + "grad_norm": 43.676961524581074, + "learning_rate": 1.0555000205499796e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.19140625, + "logps/chosen": -1004.0, + "logps/rejected": -1256.5, + "loss": 0.4802, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.4140625, + "rewards/margins": 8.109375, + "rewards/rejected": -5.69921875, + "step": 4814 + }, + { + "epoch": 0.9554045339550573, + "grad_norm": 39.85079653556385, + "learning_rate": 1.0550131072260042e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.5703125, + "logps/chosen": -1138.0, + "logps/rejected": -618.5, + "loss": 0.2993, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.58203125, + "rewards/margins": 8.328125, + "rewards/rejected": -5.74609375, + "step": 4815 + }, + { + "epoch": 0.9556029564958579, + "grad_norm": 28.55680772390683, + "learning_rate": 1.0545283260832838e-07, + "logits/chosen": 4.19921875, + "logits/rejected": 4.12890625, + "logps/chosen": -1205.0, + "logps/rejected": -787.5, + "loss": 0.2523, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.21875, + "rewards/margins": 9.421875, + "rewards/rejected": -6.203125, + "step": 4816 + }, + { + "epoch": 0.9558013790366585, + "grad_norm": 22.839352333461555, + "learning_rate": 1.0540456773543595e-07, + "logits/chosen": 4.609375, + "logits/rejected": 4.828125, + "logps/chosen": -828.5, + "logps/rejected": -763.5, + "loss": 0.4264, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.806640625, + "rewards/margins": 11.484375, + "rewards/rejected": -9.6640625, + "step": 4817 + }, + { + "epoch": 0.9559998015774592, + "grad_norm": 25.27585597270871, + "learning_rate": 1.053565161270749e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 4.078125, + "logps/chosen": -845.0, + "logps/rejected": -698.0, + "loss": 0.4131, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.78515625, + "rewards/margins": 8.1796875, + "rewards/rejected": -5.390625, + "step": 4818 + }, + { + 
"epoch": 0.9561982241182598, + "grad_norm": 26.649167998853322, + "learning_rate": 1.0530867780629479e-07, + "logits/chosen": 3.75, + "logits/rejected": 3.7265625, + "logps/chosen": -1104.0, + "logps/rejected": -696.0, + "loss": 0.2782, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.4453125, + "rewards/margins": 9.2578125, + "rewards/rejected": -5.8125, + "step": 4819 + }, + { + "epoch": 0.9563966466590604, + "grad_norm": 27.305217561466645, + "learning_rate": 1.0526105279604268e-07, + "logits/chosen": 4.578125, + "logits/rejected": 4.34375, + "logps/chosen": -1186.0, + "logps/rejected": -795.0, + "loss": 0.3112, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.69140625, + "rewards/margins": 9.1640625, + "rewards/rejected": -5.46484375, + "step": 4820 + }, + { + "epoch": 0.9565950691998611, + "grad_norm": 26.989470653442183, + "learning_rate": 1.0521364111916353e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.70703125, + "logps/chosen": -701.0, + "logps/rejected": -600.5, + "loss": 0.3887, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7890625, + "rewards/margins": 8.359375, + "rewards/rejected": -5.578125, + "step": 4821 + }, + { + "epoch": 0.9567934917406618, + "grad_norm": 36.731643587536524, + "learning_rate": 1.0516644279839992e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.078125, + "logps/chosen": -1258.0, + "logps/rejected": -650.0, + "loss": 0.3144, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.46484375, + "rewards/margins": 9.6875, + "rewards/rejected": -6.21875, + "step": 4822 + }, + { + "epoch": 0.9569919142814624, + "grad_norm": 22.833616842695665, + "learning_rate": 1.0511945785639197e-07, + "logits/chosen": 3.62890625, + "logits/rejected": 3.74609375, + "logps/chosen": -1278.0, + "logps/rejected": -734.0, + "loss": 0.313, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.07421875, + "rewards/margins": 10.015625, + "rewards/rejected": -6.9453125, + "step": 4823 + }, + { + "epoch": 0.957190336822263, 
+ "grad_norm": 30.811457426265168, + "learning_rate": 1.0507268631567759e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.96484375, + "logps/chosen": -914.0, + "logps/rejected": -684.0, + "loss": 0.3378, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.21875, + "rewards/margins": 8.0, + "rewards/rejected": -5.765625, + "step": 4824 + }, + { + "epoch": 0.9573887593630637, + "grad_norm": 31.351985169765967, + "learning_rate": 1.0502612819869216e-07, + "logits/chosen": 3.76171875, + "logits/rejected": 3.7578125, + "logps/chosen": -902.0, + "logps/rejected": -858.0, + "loss": 0.3585, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.63671875, + "rewards/margins": 8.765625, + "rewards/rejected": -6.1328125, + "step": 4825 + }, + { + "epoch": 0.9575871819038643, + "grad_norm": 19.60753226144002, + "learning_rate": 1.0497978352776886e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.1328125, + "logps/chosen": -904.0, + "logps/rejected": -688.0, + "loss": 0.2704, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.72021484375, + "rewards/margins": 9.828125, + "rewards/rejected": -7.1015625, + "step": 4826 + }, + { + "epoch": 0.9577856044446649, + "grad_norm": 22.523892380378655, + "learning_rate": 1.049336523251384e-07, + "logits/chosen": 4.515625, + "logits/rejected": 4.640625, + "logps/chosen": -836.5, + "logps/rejected": -1282.0, + "loss": 0.2794, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.22265625, + "rewards/margins": 11.640625, + "rewards/rejected": -8.44140625, + "step": 4827 + }, + { + "epoch": 0.9579840269854656, + "grad_norm": 34.85006241792323, + "learning_rate": 1.0488773461292908e-07, + "logits/chosen": 4.3515625, + "logits/rejected": 4.1640625, + "logps/chosen": -1145.0, + "logps/rejected": -845.0, + "loss": 0.3134, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.824462890625, + "rewards/margins": 8.859375, + "rewards/rejected": -7.046875, + "step": 4828 + }, + { + "epoch": 0.9581824495262662, + "grad_norm": 
38.04754024196629, + "learning_rate": 1.0484203041316677e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.2421875, + "logps/chosen": -1235.0, + "logps/rejected": -885.0, + "loss": 0.5082, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.2578125, + "rewards/margins": 7.0, + "rewards/rejected": -3.7353515625, + "step": 4829 + }, + { + "epoch": 0.9583808720670668, + "grad_norm": 26.292520668099083, + "learning_rate": 1.0479653974777506e-07, + "logits/chosen": 3.98828125, + "logits/rejected": 3.90625, + "logps/chosen": -1170.5, + "logps/rejected": -756.0, + "loss": 0.3201, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2900390625, + "rewards/margins": 9.0703125, + "rewards/rejected": -5.78125, + "step": 4830 + }, + { + "epoch": 0.9585792946078675, + "grad_norm": 32.09325929011415, + "learning_rate": 1.04751262638575e-07, + "logits/chosen": 3.59765625, + "logits/rejected": 3.59375, + "logps/chosen": -977.0, + "logps/rejected": -831.0, + "loss": 0.3452, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7646484375, + "rewards/margins": 8.703125, + "rewards/rejected": -5.9453125, + "step": 4831 + }, + { + "epoch": 0.9587777171486681, + "grad_norm": 32.616930070993185, + "learning_rate": 1.0470619910728512e-07, + "logits/chosen": 3.94140625, + "logits/rejected": 4.46875, + "logps/chosen": -1003.0, + "logps/rejected": -1194.0, + "loss": 0.3694, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.41796875, + "rewards/margins": 11.1015625, + "rewards/rejected": -8.6796875, + "step": 4832 + }, + { + "epoch": 0.9589761396894687, + "grad_norm": 26.53052752591872, + "learning_rate": 1.0466134917552161e-07, + "logits/chosen": 4.515625, + "logits/rejected": 4.453125, + "logps/chosen": -887.0, + "logps/rejected": -738.0, + "loss": 0.4812, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5390625, + "rewards/margins": 7.84375, + "rewards/rejected": -5.296875, + "step": 4833 + }, + { + "epoch": 0.9591745622302693, + "grad_norm": 25.339900713949497, + 
"learning_rate": 1.0461671286479832e-07, + "logits/chosen": 4.1015625, + "logits/rejected": 4.27734375, + "logps/chosen": -957.0, + "logps/rejected": -1732.5, + "loss": 0.2734, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.5078125, + "rewards/margins": 12.5078125, + "rewards/rejected": -8.9921875, + "step": 4834 + }, + { + "epoch": 0.95937298477107, + "grad_norm": 38.233802902911115, + "learning_rate": 1.0457229019652633e-07, + "logits/chosen": 4.44921875, + "logits/rejected": 4.3671875, + "logps/chosen": -1002.0, + "logps/rejected": -1050.0, + "loss": 0.3721, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8359375, + "rewards/margins": 10.515625, + "rewards/rejected": -7.67578125, + "step": 4835 + }, + { + "epoch": 0.9595714073118706, + "grad_norm": 24.128437530944677, + "learning_rate": 1.045280811920145e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 4.10546875, + "logps/chosen": -1033.0, + "logps/rejected": -686.5, + "loss": 0.4448, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.19140625, + "rewards/margins": 7.46875, + "rewards/rejected": -4.28125, + "step": 4836 + }, + { + "epoch": 0.9597698298526712, + "grad_norm": 27.71505092298223, + "learning_rate": 1.0448408587246914e-07, + "logits/chosen": 3.52734375, + "logits/rejected": 3.85546875, + "logps/chosen": -989.0, + "logps/rejected": -923.0, + "loss": 0.3111, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.548828125, + "rewards/margins": 10.34375, + "rewards/rejected": -7.796875, + "step": 4837 + }, + { + "epoch": 0.9599682523934719, + "grad_norm": 26.06526830660151, + "learning_rate": 1.0444030425899385e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.34375, + "logps/chosen": -1718.5, + "logps/rejected": -830.5, + "loss": 0.4225, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.7421875, + "rewards/margins": 6.860107421875, + "rewards/rejected": -5.109375, + "step": 4838 + }, + { + "epoch": 0.9601666749342725, + "grad_norm": 35.7469336690152, + 
"learning_rate": 1.0439673637259013e-07, + "logits/chosen": 3.97265625, + "logits/rejected": 3.8359375, + "logps/chosen": -978.0, + "logps/rejected": -710.0, + "loss": 0.3974, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9375, + "rewards/margins": 8.5234375, + "rewards/rejected": -5.58984375, + "step": 4839 + }, + { + "epoch": 0.9603650974750731, + "grad_norm": 29.425990151911346, + "learning_rate": 1.0435338223415652e-07, + "logits/chosen": 4.640625, + "logits/rejected": 4.65234375, + "logps/chosen": -1143.0, + "logps/rejected": -777.0, + "loss": 0.4013, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.90234375, + "rewards/margins": 8.1484375, + "rewards/rejected": -5.2421875, + "step": 4840 + }, + { + "epoch": 0.9605635200158738, + "grad_norm": 32.85987042179207, + "learning_rate": 1.0431024186448934e-07, + "logits/chosen": 3.95703125, + "logits/rejected": 3.640625, + "logps/chosen": -918.0, + "logps/rejected": -574.5, + "loss": 0.4884, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.55078125, + "rewards/margins": 6.796875, + "rewards/rejected": -4.24755859375, + "step": 4841 + }, + { + "epoch": 0.9607619425566745, + "grad_norm": 34.221580272152934, + "learning_rate": 1.042673152842822e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.30859375, + "logps/chosen": -876.0, + "logps/rejected": -641.5, + "loss": 0.4359, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.181640625, + "rewards/margins": 8.140625, + "rewards/rejected": -5.953125, + "step": 4842 + }, + { + "epoch": 0.9609603650974751, + "grad_norm": 25.387784578278403, + "learning_rate": 1.0422460251412624e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 3.84375, + "logps/chosen": -1344.0, + "logps/rejected": -803.0, + "loss": 0.4214, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.001953125, + "rewards/margins": 7.19921875, + "rewards/rejected": -4.1875, + "step": 4843 + }, + { + "epoch": 0.9611587876382757, + "grad_norm": 35.19011420882994, + "learning_rate": 
1.0418210357451009e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.1015625, + "logps/chosen": -1269.0, + "logps/rejected": -825.0, + "loss": 0.532, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.455078125, + "rewards/margins": 6.90234375, + "rewards/rejected": -4.44921875, + "step": 4844 + }, + { + "epoch": 0.9613572101790764, + "grad_norm": 27.29188605455442, + "learning_rate": 1.041398184858196e-07, + "logits/chosen": 4.2578125, + "logits/rejected": 4.13671875, + "logps/chosen": -962.0, + "logps/rejected": -802.0, + "loss": 0.3024, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.259765625, + "rewards/margins": 10.0, + "rewards/rejected": -6.71875, + "step": 4845 + }, + { + "epoch": 0.961555632719877, + "grad_norm": 26.80840173792393, + "learning_rate": 1.0409774726833832e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.25390625, + "logps/chosen": -932.0, + "logps/rejected": -678.5, + "loss": 0.4148, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.669921875, + "rewards/margins": 7.1484375, + "rewards/rejected": -4.48046875, + "step": 4846 + }, + { + "epoch": 0.9617540552606776, + "grad_norm": 29.873977723637847, + "learning_rate": 1.0405588994224692e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.7734375, + "logps/chosen": -747.0, + "logps/rejected": -785.5, + "loss": 0.4063, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.390625, + "rewards/margins": 15.5703125, + "rewards/rejected": -13.21875, + "step": 4847 + }, + { + "epoch": 0.9619524778014783, + "grad_norm": 38.23963298506707, + "learning_rate": 1.0401424652762368e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 3.89453125, + "logps/chosen": -895.5, + "logps/rejected": -468.0, + "loss": 0.5209, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5234375, + "rewards/margins": 5.80078125, + "rewards/rejected": -3.279296875, + "step": 4848 + }, + { + "epoch": 0.9621509003422789, + "grad_norm": 27.007171473447908, + "learning_rate": 
1.039728170444443e-07, + "logits/chosen": 3.6015625, + "logits/rejected": 3.48046875, + "logps/chosen": -795.0, + "logps/rejected": -485.0, + "loss": 0.3877, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.765625, + "rewards/margins": 7.2265625, + "rewards/rejected": -4.4609375, + "step": 4849 + }, + { + "epoch": 0.9623493228830795, + "grad_norm": 29.97552815272199, + "learning_rate": 1.0393160151258157e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.8359375, + "logps/chosen": -1120.0, + "logps/rejected": -1480.5, + "loss": 0.3604, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.79296875, + "rewards/margins": 14.28125, + "rewards/rejected": -11.4765625, + "step": 4850 + }, + { + "epoch": 0.9625477454238801, + "grad_norm": 23.624839687034378, + "learning_rate": 1.0389059995180604e-07, + "logits/chosen": 4.0546875, + "logits/rejected": 4.0, + "logps/chosen": -1420.0, + "logps/rejected": -754.0, + "loss": 0.2809, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.203125, + "rewards/margins": 9.203125, + "rewards/rejected": -6.0, + "step": 4851 + }, + { + "epoch": 0.9627461679646808, + "grad_norm": 24.501703435831484, + "learning_rate": 1.0384981238178533e-07, + "logits/chosen": 3.82421875, + "logits/rejected": 4.01171875, + "logps/chosen": -933.0, + "logps/rejected": -585.0, + "loss": 0.5125, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.611328125, + "rewards/margins": 6.71875, + "rewards/rejected": -5.11328125, + "step": 4852 + }, + { + "epoch": 0.9629445905054814, + "grad_norm": 30.752169603176327, + "learning_rate": 1.0380923882208453e-07, + "logits/chosen": 3.59375, + "logits/rejected": 4.046875, + "logps/chosen": -897.5, + "logps/rejected": -682.0, + "loss": 0.4363, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.5, + "rewards/margins": 6.84375, + "rewards/rejected": -4.33984375, + "step": 4853 + }, + { + "epoch": 0.963143013046282, + "grad_norm": 26.9003911224651, + "learning_rate": 1.0376887929216603e-07, + 
"logits/chosen": 4.1796875, + "logits/rejected": 4.03515625, + "logps/chosen": -1194.0, + "logps/rejected": -813.0, + "loss": 0.3721, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0859375, + "rewards/margins": 7.6484375, + "rewards/rejected": -4.5625, + "step": 4854 + }, + { + "epoch": 0.9633414355870827, + "grad_norm": 29.912191771818126, + "learning_rate": 1.0372873381138965e-07, + "logits/chosen": 3.96875, + "logits/rejected": 4.125, + "logps/chosen": -1121.0, + "logps/rejected": -1724.0, + "loss": 0.3281, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.23828125, + "rewards/margins": 11.65625, + "rewards/rejected": -8.419921875, + "step": 4855 + }, + { + "epoch": 0.9635398581278833, + "grad_norm": 35.58964566692345, + "learning_rate": 1.0368880239901241e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.78515625, + "logps/chosen": -1068.0, + "logps/rejected": -725.0, + "loss": 0.3171, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6015625, + "rewards/margins": 8.6640625, + "rewards/rejected": -6.0859375, + "step": 4856 + }, + { + "epoch": 0.9637382806686839, + "grad_norm": 27.41421436516712, + "learning_rate": 1.0364908507418873e-07, + "logits/chosen": 4.03125, + "logits/rejected": 3.94140625, + "logps/chosen": -984.0, + "logps/rejected": -619.0, + "loss": 0.3682, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.603515625, + "rewards/margins": 7.9375, + "rewards/rejected": -5.33984375, + "step": 4857 + }, + { + "epoch": 0.9639367032094845, + "grad_norm": 23.43871547719219, + "learning_rate": 1.0360958185597022e-07, + "logits/chosen": 4.4453125, + "logits/rejected": 4.453125, + "logps/chosen": -1035.0, + "logps/rejected": -852.0, + "loss": 0.3126, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.30859375, + "rewards/margins": 9.5, + "rewards/rejected": -6.1796875, + "step": 4858 + }, + { + "epoch": 0.9641351257502853, + "grad_norm": 33.11848577082329, + "learning_rate": 1.0357029276330594e-07, + "logits/chosen": 4.25, + 
"logits/rejected": 4.2890625, + "logps/chosen": -891.0, + "logps/rejected": -720.0, + "loss": 0.3084, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.984375, + "rewards/margins": 8.0859375, + "rewards/rejected": -5.11328125, + "step": 4859 + }, + { + "epoch": 0.9643335482910859, + "grad_norm": 29.418563339388168, + "learning_rate": 1.0353121781504226e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.359375, + "logps/chosen": -1137.5, + "logps/rejected": -852.0, + "loss": 0.3415, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6171875, + "rewards/margins": 10.0703125, + "rewards/rejected": -7.453125, + "step": 4860 + }, + { + "epoch": 0.9645319708318865, + "grad_norm": 33.43860931869727, + "learning_rate": 1.0349235702992257e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.10546875, + "logps/chosen": -884.0, + "logps/rejected": -717.0, + "loss": 0.3706, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.62890625, + "rewards/margins": 7.8828125, + "rewards/rejected": -5.25, + "step": 4861 + }, + { + "epoch": 0.9647303933726872, + "grad_norm": 31.848924080822187, + "learning_rate": 1.0345371042658779e-07, + "logits/chosen": 4.05859375, + "logits/rejected": 3.9609375, + "logps/chosen": -1076.0, + "logps/rejected": -839.0, + "loss": 0.3873, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.181640625, + "rewards/margins": 8.7109375, + "rewards/rejected": -6.52734375, + "step": 4862 + }, + { + "epoch": 0.9649288159134878, + "grad_norm": 31.270451905427162, + "learning_rate": 1.03415278023576e-07, + "logits/chosen": 4.5, + "logits/rejected": 4.28515625, + "logps/chosen": -864.0, + "logps/rejected": -782.0, + "loss": 0.4196, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.60546875, + "rewards/margins": 8.2421875, + "rewards/rejected": -5.640625, + "step": 4863 + }, + { + "epoch": 0.9651272384542884, + "grad_norm": 28.803918349024933, + "learning_rate": 1.0337705983932254e-07, + "logits/chosen": 4.30859375, + "logits/rejected": 
4.140625, + "logps/chosen": -835.0, + "logps/rejected": -709.0, + "loss": 0.3572, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.689453125, + "rewards/margins": 9.4140625, + "rewards/rejected": -6.7265625, + "step": 4864 + }, + { + "epoch": 0.9653256609950891, + "grad_norm": 30.71722487161524, + "learning_rate": 1.0333905589215999e-07, + "logits/chosen": 3.828125, + "logits/rejected": 3.96484375, + "logps/chosen": -1208.0, + "logps/rejected": -790.0, + "loss": 0.4563, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.720703125, + "rewards/margins": 7.4453125, + "rewards/rejected": -4.71875, + "step": 4865 + }, + { + "epoch": 0.9655240835358897, + "grad_norm": 33.476443730621845, + "learning_rate": 1.0330126620031818e-07, + "logits/chosen": 4.140625, + "logits/rejected": 4.0078125, + "logps/chosen": -1370.0, + "logps/rejected": -809.0, + "loss": 0.3176, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.525390625, + "rewards/margins": 10.484375, + "rewards/rejected": -6.96875, + "step": 4866 + }, + { + "epoch": 0.9657225060766903, + "grad_norm": 30.9808982624866, + "learning_rate": 1.0326369078192411e-07, + "logits/chosen": 3.78125, + "logits/rejected": 3.79296875, + "logps/chosen": -800.0, + "logps/rejected": -781.5, + "loss": 0.3811, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.314453125, + "rewards/margins": 8.671875, + "rewards/rejected": -6.359375, + "step": 4867 + }, + { + "epoch": 0.9659209286174909, + "grad_norm": 30.8936574473459, + "learning_rate": 1.032263296550021e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 3.859375, + "logps/chosen": -1140.0, + "logps/rejected": -778.0, + "loss": 0.2878, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.126953125, + "rewards/margins": 9.0078125, + "rewards/rejected": -5.8828125, + "step": 4868 + }, + { + "epoch": 0.9661193511582916, + "grad_norm": 30.498907222794198, + "learning_rate": 1.0318918283747358e-07, + "logits/chosen": 3.52734375, + "logits/rejected": 3.73828125, + 
"logps/chosen": -848.0, + "logps/rejected": -693.5, + "loss": 0.4494, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.8232421875, + "rewards/margins": 9.0, + "rewards/rejected": -7.1796875, + "step": 4869 + }, + { + "epoch": 0.9663177736990922, + "grad_norm": 27.602020396732083, + "learning_rate": 1.0315225034715717e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.9765625, + "logps/chosen": -912.5, + "logps/rejected": -1350.0, + "loss": 0.3949, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.248046875, + "rewards/margins": 11.21875, + "rewards/rejected": -7.9765625, + "step": 4870 + }, + { + "epoch": 0.9665161962398928, + "grad_norm": 36.80822674450793, + "learning_rate": 1.0311553220176889e-07, + "logits/chosen": 4.265625, + "logits/rejected": 4.2890625, + "logps/chosen": -863.0, + "logps/rejected": -741.5, + "loss": 0.3811, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.173828125, + "rewards/margins": 9.8359375, + "rewards/rejected": -7.6484375, + "step": 4871 + }, + { + "epoch": 0.9667146187806935, + "grad_norm": 20.99979151980142, + "learning_rate": 1.030790284189216e-07, + "logits/chosen": 3.953125, + "logits/rejected": 4.328125, + "logps/chosen": -1148.0, + "logps/rejected": -1597.0, + "loss": 0.3915, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.400390625, + "rewards/margins": 13.46875, + "rewards/rejected": -10.1015625, + "step": 4872 + }, + { + "epoch": 0.9669130413214941, + "grad_norm": 24.186242366559384, + "learning_rate": 1.0304273901612565e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.328125, + "logps/chosen": -1185.0, + "logps/rejected": -845.0, + "loss": 0.5171, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.458984375, + "rewards/margins": 7.7890625, + "rewards/rejected": -5.3203125, + "step": 4873 + }, + { + "epoch": 0.9671114638622947, + "grad_norm": 31.35417143357748, + "learning_rate": 1.0300666401078836e-07, + "logits/chosen": 4.16015625, + "logits/rejected": 4.203125, + "logps/chosen": 
-875.5, + "logps/rejected": -627.0, + "loss": 0.3907, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4384765625, + "rewards/margins": 6.875, + "rewards/rejected": -4.44921875, + "step": 4874 + }, + { + "epoch": 0.9673098864030953, + "grad_norm": 31.62933499742104, + "learning_rate": 1.0297080342021428e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.859375, + "logps/chosen": -899.0, + "logps/rejected": -574.0, + "loss": 0.3088, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.421875, + "rewards/margins": 8.375, + "rewards/rejected": -5.9453125, + "step": 4875 + }, + { + "epoch": 0.9675083089438961, + "grad_norm": 22.145534902672203, + "learning_rate": 1.0293515726160508e-07, + "logits/chosen": 4.40625, + "logits/rejected": 4.27734375, + "logps/chosen": -900.0, + "logps/rejected": -766.0, + "loss": 0.4093, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.93359375, + "rewards/margins": 12.40625, + "rewards/rejected": -9.4375, + "step": 4876 + }, + { + "epoch": 0.9677067314846967, + "grad_norm": 33.54832708309929, + "learning_rate": 1.0289972555205964e-07, + "logits/chosen": 3.5078125, + "logits/rejected": 3.671875, + "logps/chosen": -832.0, + "logps/rejected": -877.0, + "loss": 0.3859, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.994140625, + "rewards/margins": 8.1171875, + "rewards/rejected": -6.1171875, + "step": 4877 + }, + { + "epoch": 0.9679051540254973, + "grad_norm": 27.12014234252479, + "learning_rate": 1.0286450830857387e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.05078125, + "logps/chosen": -1168.0, + "logps/rejected": -776.0, + "loss": 0.5009, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.5, + "rewards/margins": 6.40234375, + "rewards/rejected": -3.908203125, + "step": 4878 + }, + { + "epoch": 0.968103576566298, + "grad_norm": 22.904935446433818, + "learning_rate": 1.0282950554804083e-07, + "logits/chosen": 3.484375, + "logits/rejected": 3.59375, + "logps/chosen": -1129.0, + "logps/rejected": -898.0, 
+ "loss": 0.3204, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.6484375, + "rewards/margins": 9.6171875, + "rewards/rejected": -5.98046875, + "step": 4879 + }, + { + "epoch": 0.9683019991070986, + "grad_norm": 31.98089077261743, + "learning_rate": 1.0279471728725081e-07, + "logits/chosen": 4.0078125, + "logits/rejected": 4.3515625, + "logps/chosen": -1055.0, + "logps/rejected": -1007.5, + "loss": 0.4217, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.466796875, + "rewards/margins": 7.9921875, + "rewards/rejected": -5.53515625, + "step": 4880 + }, + { + "epoch": 0.9685004216478992, + "grad_norm": 32.484734136057334, + "learning_rate": 1.0276014354289107e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 3.76953125, + "logps/chosen": -1046.0, + "logps/rejected": -590.5, + "loss": 0.4393, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.227294921875, + "rewards/margins": 6.58203125, + "rewards/rejected": -4.359375, + "step": 4881 + }, + { + "epoch": 0.9686988441886998, + "grad_norm": 24.27701901207057, + "learning_rate": 1.0272578433154602e-07, + "logits/chosen": 3.73828125, + "logits/rejected": 3.5078125, + "logps/chosen": -1049.0, + "logps/rejected": -743.5, + "loss": 0.2872, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.83984375, + "rewards/margins": 10.109375, + "rewards/rejected": -7.265625, + "step": 4882 + }, + { + "epoch": 0.9688972667295005, + "grad_norm": 32.01061018450412, + "learning_rate": 1.0269163966969721e-07, + "logits/chosen": 3.78515625, + "logits/rejected": 3.98046875, + "logps/chosen": -782.0, + "logps/rejected": -445.5, + "loss": 0.481, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.5703125, + "rewards/margins": 6.837890625, + "rewards/rejected": -5.26953125, + "step": 4883 + }, + { + "epoch": 0.9690956892703011, + "grad_norm": 33.15666404837725, + "learning_rate": 1.0265770957372315e-07, + "logits/chosen": 3.8671875, + "logits/rejected": 3.66015625, + "logps/chosen": -973.0, + "logps/rejected": -880.0, + 
"loss": 0.2779, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.421875, + "rewards/margins": 10.359375, + "rewards/rejected": -6.9609375, + "step": 4884 + }, + { + "epoch": 0.9692941118111017, + "grad_norm": 24.90454525718406, + "learning_rate": 1.0262399405989955e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 4.296875, + "logps/chosen": -1262.0, + "logps/rejected": -1964.0, + "loss": 0.2501, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.88671875, + "rewards/margins": 14.71875, + "rewards/rejected": -11.859375, + "step": 4885 + }, + { + "epoch": 0.9694925343519024, + "grad_norm": 32.695803176742224, + "learning_rate": 1.0259049314439914e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 4.1875, + "logps/chosen": -973.0, + "logps/rejected": -790.0, + "loss": 0.416, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.87890625, + "rewards/margins": 8.6484375, + "rewards/rejected": -5.765625, + "step": 4886 + }, + { + "epoch": 0.969690956892703, + "grad_norm": 42.74601963585623, + "learning_rate": 1.0255720684329171e-07, + "logits/chosen": 3.625, + "logits/rejected": 3.734375, + "logps/chosen": -749.0, + "logps/rejected": -749.0, + "loss": 0.4456, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4755859375, + "rewards/margins": 6.5078125, + "rewards/rejected": -5.0390625, + "step": 4887 + }, + { + "epoch": 0.9698893794335036, + "grad_norm": 32.686384362424526, + "learning_rate": 1.025241351725441e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 4.1328125, + "logps/chosen": -955.0, + "logps/rejected": -676.0, + "loss": 0.5504, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.32421875, + "rewards/margins": 7.0, + "rewards/rejected": -4.66796875, + "step": 4888 + }, + { + "epoch": 0.9700878019743043, + "grad_norm": 37.79299756351384, + "learning_rate": 1.0249127814802026e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 4.2890625, + "logps/chosen": -1068.0, + "logps/rejected": -838.0, + "loss": 0.3954, + 
"rewards/accuracies": 0.84375, + "rewards/chosen": 2.619140625, + "rewards/margins": 8.9375, + "rewards/rejected": -6.31640625, + "step": 4889 + }, + { + "epoch": 0.9702862245151049, + "grad_norm": 29.656384405250982, + "learning_rate": 1.0245863578548097e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 3.99609375, + "logps/chosen": -1140.0, + "logps/rejected": -648.0, + "loss": 0.3407, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.892578125, + "rewards/margins": 8.421875, + "rewards/rejected": -5.5390625, + "step": 4890 + }, + { + "epoch": 0.9704846470559055, + "grad_norm": 24.007193700523796, + "learning_rate": 1.0242620810058438e-07, + "logits/chosen": 4.09375, + "logits/rejected": 3.9453125, + "logps/chosen": -773.0, + "logps/rejected": -659.0, + "loss": 0.3051, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.98046875, + "rewards/margins": 8.640625, + "rewards/rejected": -5.6875, + "step": 4891 + }, + { + "epoch": 0.9706830695967061, + "grad_norm": 32.783021776243274, + "learning_rate": 1.0239399510888539e-07, + "logits/chosen": 3.71484375, + "logits/rejected": 3.25, + "logps/chosen": -1017.5, + "logps/rejected": -674.0, + "loss": 0.3489, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.400390625, + "rewards/margins": 8.671875, + "rewards/rejected": -6.26953125, + "step": 4892 + }, + { + "epoch": 0.9708814921375069, + "grad_norm": 34.66678336885407, + "learning_rate": 1.0236199682583596e-07, + "logits/chosen": 4.31640625, + "logits/rejected": 4.296875, + "logps/chosen": -630.0, + "logps/rejected": -467.5, + "loss": 0.3711, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.857421875, + "rewards/margins": 6.9296875, + "rewards/rejected": -5.0703125, + "step": 4893 + }, + { + "epoch": 0.9710799146783075, + "grad_norm": 37.031609894254565, + "learning_rate": 1.0233021326678516e-07, + "logits/chosen": 3.8515625, + "logits/rejected": 3.88671875, + "logps/chosen": -859.0, + "logps/rejected": -644.5, + "loss": 0.5369, + 
"rewards/accuracies": 0.8125, + "rewards/chosen": 2.55859375, + "rewards/margins": 6.96484375, + "rewards/rejected": -4.396484375, + "step": 4894 + }, + { + "epoch": 0.9712783372191081, + "grad_norm": 44.16152033247856, + "learning_rate": 1.022986444469789e-07, + "logits/chosen": 4.359375, + "logits/rejected": 4.203125, + "logps/chosen": -1064.0, + "logps/rejected": -1114.0, + "loss": 0.3398, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.302734375, + "rewards/margins": 14.546875, + "rewards/rejected": -12.21875, + "step": 4895 + }, + { + "epoch": 0.9714767597599088, + "grad_norm": 22.339420502844014, + "learning_rate": 1.0226729038156036e-07, + "logits/chosen": 4.22265625, + "logits/rejected": 4.28515625, + "logps/chosen": -1360.0, + "logps/rejected": -997.0, + "loss": 0.216, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.07421875, + "rewards/margins": 12.546875, + "rewards/rejected": -8.4609375, + "step": 4896 + }, + { + "epoch": 0.9716751823007094, + "grad_norm": 27.662357837123032, + "learning_rate": 1.0223615108556937e-07, + "logits/chosen": 4.0625, + "logits/rejected": 3.6796875, + "logps/chosen": -1130.0, + "logps/rejected": -646.5, + "loss": 0.2301, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.6015625, + "rewards/margins": 10.703125, + "rewards/rejected": -7.109375, + "step": 4897 + }, + { + "epoch": 0.97187360484151, + "grad_norm": 25.992072960933726, + "learning_rate": 1.0220522657394297e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 3.6171875, + "logps/chosen": -1233.0, + "logps/rejected": -852.0, + "loss": 0.2796, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.33203125, + "rewards/margins": 9.28125, + "rewards/rejected": -5.9609375, + "step": 4898 + }, + { + "epoch": 0.9720720273823106, + "grad_norm": 33.27279297450607, + "learning_rate": 1.021745168615151e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.04296875, + "logps/chosen": -1362.0, + "logps/rejected": -1242.0, + "loss": 0.2603, + 
"rewards/accuracies": 0.9375, + "rewards/chosen": 3.12890625, + "rewards/margins": 17.46875, + "rewards/rejected": -14.3125, + "step": 4899 + }, + { + "epoch": 0.9722704499231113, + "grad_norm": 33.467218046593985, + "learning_rate": 1.0214402196301656e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 3.9375, + "logps/chosen": -1312.0, + "logps/rejected": -1486.0, + "loss": 0.3768, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4892578125, + "rewards/margins": 12.0625, + "rewards/rejected": -9.5625, + "step": 4900 + }, + { + "epoch": 0.9724688724639119, + "grad_norm": 30.433086550656977, + "learning_rate": 1.0211374189307538e-07, + "logits/chosen": 4.1328125, + "logits/rejected": 3.78125, + "logps/chosen": -963.0, + "logps/rejected": -845.5, + "loss": 0.3623, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2734375, + "rewards/margins": 9.546875, + "rewards/rejected": -7.265625, + "step": 4901 + }, + { + "epoch": 0.9726672950047125, + "grad_norm": 32.06583610069632, + "learning_rate": 1.0208367666621627e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 3.75390625, + "logps/chosen": -1283.0, + "logps/rejected": -907.0, + "loss": 0.3738, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.626953125, + "rewards/margins": 8.71875, + "rewards/rejected": -6.109375, + "step": 4902 + }, + { + "epoch": 0.9728657175455132, + "grad_norm": 30.2954027031263, + "learning_rate": 1.0205382629686099e-07, + "logits/chosen": 3.9375, + "logits/rejected": 3.94140625, + "logps/chosen": -1018.0, + "logps/rejected": -634.0, + "loss": 0.3233, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6484375, + "rewards/margins": 8.6484375, + "rewards/rejected": -6.0, + "step": 4903 + }, + { + "epoch": 0.9730641400863138, + "grad_norm": 27.566642160240654, + "learning_rate": 1.0202419079932825e-07, + "logits/chosen": 3.6015625, + "logits/rejected": 3.69921875, + "logps/chosen": -1370.0, + "logps/rejected": -837.5, + "loss": 0.3657, + "rewards/accuracies": 0.8125, + 
"rewards/chosen": 3.16015625, + "rewards/margins": 10.109375, + "rewards/rejected": -6.953125, + "step": 4904 + }, + { + "epoch": 0.9732625626271144, + "grad_norm": 57.1615653389739, + "learning_rate": 1.0199477018783369e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.55078125, + "logps/chosen": -649.0, + "logps/rejected": -642.0, + "loss": 0.6147, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.6298828125, + "rewards/margins": 6.17578125, + "rewards/rejected": -5.546875, + "step": 4905 + }, + { + "epoch": 0.9734609851679151, + "grad_norm": 32.6286484373654, + "learning_rate": 1.0196556447648978e-07, + "logits/chosen": 4.08203125, + "logits/rejected": 3.9375, + "logps/chosen": -848.0, + "logps/rejected": -704.5, + "loss": 0.5015, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.107421875, + "rewards/margins": 11.1171875, + "rewards/rejected": -9.0234375, + "step": 4906 + }, + { + "epoch": 0.9736594077087157, + "grad_norm": 30.6614844447454, + "learning_rate": 1.0193657367930606e-07, + "logits/chosen": 4.26953125, + "logits/rejected": 4.0625, + "logps/chosen": -923.0, + "logps/rejected": -1197.0, + "loss": 0.3991, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.15234375, + "rewards/margins": 10.71875, + "rewards/rejected": -7.5859375, + "step": 4907 + }, + { + "epoch": 0.9738578302495163, + "grad_norm": 32.23551214742731, + "learning_rate": 1.0190779781018886e-07, + "logits/chosen": 3.9609375, + "logits/rejected": 3.53515625, + "logps/chosen": -943.0, + "logps/rejected": -528.0, + "loss": 0.3647, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.33984375, + "rewards/margins": 6.859375, + "rewards/rejected": -4.51953125, + "step": 4908 + }, + { + "epoch": 0.9740562527903169, + "grad_norm": 24.75684976255492, + "learning_rate": 1.0187923688294147e-07, + "logits/chosen": 4.41796875, + "logits/rejected": 4.375, + "logps/chosen": -1013.0, + "logps/rejected": -883.0, + "loss": 0.4068, + "rewards/accuracies": 0.84375, + "rewards/chosen": 
2.5234375, + "rewards/margins": 8.0078125, + "rewards/rejected": -5.48046875, + "step": 4909 + }, + { + "epoch": 0.9742546753311176, + "grad_norm": 28.030881280276482, + "learning_rate": 1.0185089091126409e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.421875, + "logps/chosen": -1292.0, + "logps/rejected": -995.0, + "loss": 0.4215, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.98046875, + "rewards/margins": 8.984375, + "rewards/rejected": -6.015625, + "step": 4910 + }, + { + "epoch": 0.9744530978719183, + "grad_norm": 31.727336423698343, + "learning_rate": 1.0182275990875372e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 4.25, + "logps/chosen": -1268.0, + "logps/rejected": -934.0, + "loss": 0.422, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.85546875, + "rewards/margins": 7.5625, + "rewards/rejected": -4.7158203125, + "step": 4911 + }, + { + "epoch": 0.9746515204127189, + "grad_norm": 28.430373643008185, + "learning_rate": 1.0179484388890432e-07, + "logits/chosen": 3.859375, + "logits/rejected": 3.7734375, + "logps/chosen": -711.5, + "logps/rejected": -490.5, + "loss": 0.5206, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.0615234375, + "rewards/margins": 5.0625, + "rewards/rejected": -3.0078125, + "step": 4912 + }, + { + "epoch": 0.9748499429535196, + "grad_norm": 24.760882110124328, + "learning_rate": 1.0176714286510674e-07, + "logits/chosen": 4.25, + "logits/rejected": 4.1875, + "logps/chosen": -777.0, + "logps/rejected": -1093.5, + "loss": 0.417, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.60546875, + "rewards/margins": 8.59375, + "rewards/rejected": -5.98046875, + "step": 4913 + }, + { + "epoch": 0.9750483654943202, + "grad_norm": 29.19499520462459, + "learning_rate": 1.0173965685064857e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 4.12109375, + "logps/chosen": -968.0, + "logps/rejected": -938.5, + "loss": 0.4664, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.046875, + "rewards/margins": 
8.25, + "rewards/rejected": -5.19140625, + "step": 4914 + }, + { + "epoch": 0.9752467880351208, + "grad_norm": 33.620088683869845, + "learning_rate": 1.017123858587145e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.86328125, + "logps/chosen": -839.5, + "logps/rejected": -556.75, + "loss": 0.3644, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.361328125, + "rewards/margins": 7.4921875, + "rewards/rejected": -5.12890625, + "step": 4915 + }, + { + "epoch": 0.9754452105759214, + "grad_norm": 39.49330797827618, + "learning_rate": 1.0168532990238586e-07, + "logits/chosen": 4.28515625, + "logits/rejected": 4.1015625, + "logps/chosen": -1075.0, + "logps/rejected": -617.0, + "loss": 0.3842, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.490234375, + "rewards/margins": 7.1953125, + "rewards/rejected": -4.703125, + "step": 4916 + }, + { + "epoch": 0.9756436331167221, + "grad_norm": 26.78639762906517, + "learning_rate": 1.0165848899464095e-07, + "logits/chosen": 4.203125, + "logits/rejected": 4.125, + "logps/chosen": -1418.0, + "logps/rejected": -857.0, + "loss": 0.3181, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0576171875, + "rewards/margins": 8.83984375, + "rewards/rejected": -5.78125, + "step": 4917 + }, + { + "epoch": 0.9758420556575227, + "grad_norm": 23.14088794263303, + "learning_rate": 1.0163186314835484e-07, + "logits/chosen": 3.984375, + "logits/rejected": 4.12890625, + "logps/chosen": -948.0, + "logps/rejected": -873.0, + "loss": 0.324, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.58984375, + "rewards/margins": 14.234375, + "rewards/rejected": -11.65625, + "step": 4918 + }, + { + "epoch": 0.9760404781983233, + "grad_norm": 26.563348875567055, + "learning_rate": 1.016054523762995e-07, + "logits/chosen": 4.109375, + "logits/rejected": 4.1328125, + "logps/chosen": -909.0, + "logps/rejected": -676.0, + "loss": 0.3299, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.591796875, + "rewards/margins": 8.0, + 
"rewards/rejected": -5.40625, + "step": 4919 + }, + { + "epoch": 0.976238900739124, + "grad_norm": 28.148275712432028, + "learning_rate": 1.015792566911437e-07, + "logits/chosen": 4.09765625, + "logits/rejected": 4.3671875, + "logps/chosen": -1347.0, + "logps/rejected": -1037.0, + "loss": 0.3365, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.466796875, + "rewards/margins": 9.859375, + "rewards/rejected": -7.40625, + "step": 4920 + }, + { + "epoch": 0.9764373232799246, + "grad_norm": 28.126968640491427, + "learning_rate": 1.0155327610545303e-07, + "logits/chosen": 3.94921875, + "logits/rejected": 3.95703125, + "logps/chosen": -1553.0, + "logps/rejected": -932.0, + "loss": 0.3553, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.53515625, + "rewards/margins": 9.4375, + "rewards/rejected": -6.8984375, + "step": 4921 + }, + { + "epoch": 0.9766357458207252, + "grad_norm": 24.364182842633504, + "learning_rate": 1.0152751063168995e-07, + "logits/chosen": 4.421875, + "logits/rejected": 4.109375, + "logps/chosen": -843.0, + "logps/rejected": -662.0, + "loss": 0.4302, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.5078125, + "rewards/margins": 8.2421875, + "rewards/rejected": -4.75, + "step": 4922 + }, + { + "epoch": 0.9768341683615259, + "grad_norm": 30.335479029774515, + "learning_rate": 1.0150196028221364e-07, + "logits/chosen": 4.2890625, + "logits/rejected": 4.23828125, + "logps/chosen": -1206.0, + "logps/rejected": -711.0, + "loss": 0.3992, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.744140625, + "rewards/margins": 7.828125, + "rewards/rejected": -5.08203125, + "step": 4923 + }, + { + "epoch": 0.9770325909023265, + "grad_norm": 48.99299539943507, + "learning_rate": 1.0147662506928022e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.07421875, + "logps/chosen": -1373.0, + "logps/rejected": -1011.0, + "loss": 0.3265, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.08984375, + "rewards/margins": 19.03125, + "rewards/rejected": 
-15.8984375, + "step": 4924 + }, + { + "epoch": 0.9772310134431271, + "grad_norm": 34.854964153618525, + "learning_rate": 1.014515050050425e-07, + "logits/chosen": 3.9453125, + "logits/rejected": 3.73828125, + "logps/chosen": -640.0, + "logps/rejected": -824.5, + "loss": 0.4071, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.52734375, + "rewards/margins": 7.96875, + "rewards/rejected": -5.44921875, + "step": 4925 + }, + { + "epoch": 0.9774294359839277, + "grad_norm": 37.40550960071066, + "learning_rate": 1.0142660010155016e-07, + "logits/chosen": 3.765625, + "logits/rejected": 3.80859375, + "logps/chosen": -1000.0, + "logps/rejected": -1303.0, + "loss": 0.4072, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.763671875, + "rewards/margins": 10.1953125, + "rewards/rejected": -7.4375, + "step": 4926 + }, + { + "epoch": 0.9776278585247284, + "grad_norm": 24.88570976839655, + "learning_rate": 1.0140191037074962e-07, + "logits/chosen": 4.5234375, + "logits/rejected": 4.2109375, + "logps/chosen": -1295.0, + "logps/rejected": -866.0, + "loss": 0.3291, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.84765625, + "rewards/margins": 10.421875, + "rewards/rejected": -6.6015625, + "step": 4927 + }, + { + "epoch": 0.977826281065529, + "grad_norm": 29.462046473882953, + "learning_rate": 1.0137743582448408e-07, + "logits/chosen": 4.046875, + "logits/rejected": 4.14453125, + "logps/chosen": -1105.0, + "logps/rejected": -768.0, + "loss": 0.3543, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.59375, + "rewards/margins": 8.8125, + "rewards/rejected": -5.20703125, + "step": 4928 + }, + { + "epoch": 0.9780247036063296, + "grad_norm": 28.304404184793455, + "learning_rate": 1.013531764744936e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 3.859375, + "logps/chosen": -1314.0, + "logps/rejected": -1083.0, + "loss": 0.2153, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.125, + "rewards/margins": 9.9609375, + "rewards/rejected": -6.84375, + "step": 
4929 + }, + { + "epoch": 0.9782231261471304, + "grad_norm": 23.459968203734928, + "learning_rate": 1.0132913233241489e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.25, + "logps/chosen": -1152.0, + "logps/rejected": -764.0, + "loss": 0.3733, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.75, + "rewards/margins": 8.390625, + "rewards/rejected": -4.6484375, + "step": 4930 + }, + { + "epoch": 0.978421548687931, + "grad_norm": 33.495983872727315, + "learning_rate": 1.0130530340978155e-07, + "logits/chosen": 4.43359375, + "logits/rejected": 4.31640625, + "logps/chosen": -903.5, + "logps/rejected": -778.0, + "loss": 0.3869, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.15234375, + "rewards/margins": 9.484375, + "rewards/rejected": -6.34375, + "step": 4931 + }, + { + "epoch": 0.9786199712287316, + "grad_norm": 32.131730402464605, + "learning_rate": 1.0128168971802384e-07, + "logits/chosen": 4.23046875, + "logits/rejected": 4.16796875, + "logps/chosen": -1331.0, + "logps/rejected": -985.0, + "loss": 0.3553, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.66015625, + "rewards/margins": 9.75, + "rewards/rejected": -6.078125, + "step": 4932 + }, + { + "epoch": 0.9788183937695322, + "grad_norm": 33.82924393459734, + "learning_rate": 1.0125829126846893e-07, + "logits/chosen": 3.85546875, + "logits/rejected": 3.59375, + "logps/chosen": -887.0, + "logps/rejected": -568.5, + "loss": 0.482, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.19189453125, + "rewards/margins": 5.125, + "rewards/rejected": -3.94921875, + "step": 4933 + }, + { + "epoch": 0.9790168163103329, + "grad_norm": 36.527571742896875, + "learning_rate": 1.0123510807234057e-07, + "logits/chosen": 3.7421875, + "logits/rejected": 4.171875, + "logps/chosen": -1266.0, + "logps/rejected": -1121.0, + "loss": 0.3429, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3125, + "rewards/margins": 20.1328125, + "rewards/rejected": -16.80078125, + "step": 4934 + }, + { + "epoch": 
0.9792152388511335, + "grad_norm": 37.55242892426469, + "learning_rate": 1.012121401407593e-07, + "logits/chosen": 3.85546875, + "logits/rejected": 3.59765625, + "logps/chosen": -1001.0, + "logps/rejected": -642.5, + "loss": 0.6042, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.0390625, + "rewards/margins": 4.8984375, + "rewards/rejected": -2.861328125, + "step": 4935 + }, + { + "epoch": 0.9794136613919341, + "grad_norm": 51.71922802028089, + "learning_rate": 1.0118938748474248e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 3.765625, + "logps/chosen": -1478.0, + "logps/rejected": -811.0, + "loss": 0.3677, + "rewards/accuracies": 0.84375, + "rewards/chosen": 4.28125, + "rewards/margins": 9.34375, + "rewards/rejected": -5.07421875, + "step": 4936 + }, + { + "epoch": 0.9796120839327348, + "grad_norm": 31.63844334520068, + "learning_rate": 1.0116685011520412e-07, + "logits/chosen": 3.9765625, + "logits/rejected": 3.859375, + "logps/chosen": -889.0, + "logps/rejected": -602.0, + "loss": 0.4176, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5, + "rewards/margins": 6.90625, + "rewards/rejected": -4.40625, + "step": 4937 + }, + { + "epoch": 0.9798105064735354, + "grad_norm": 23.570514509710087, + "learning_rate": 1.0114452804295506e-07, + "logits/chosen": 4.4296875, + "logits/rejected": 4.29296875, + "logps/chosen": -994.5, + "logps/rejected": -669.0, + "loss": 0.4753, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.68359375, + "rewards/margins": 7.375, + "rewards/rejected": -4.6884765625, + "step": 4938 + }, + { + "epoch": 0.980008929014336, + "grad_norm": 35.72409366984399, + "learning_rate": 1.0112242127870268e-07, + "logits/chosen": 4.06640625, + "logits/rejected": 3.81640625, + "logps/chosen": -1042.0, + "logps/rejected": -790.0, + "loss": 0.3248, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.96484375, + "rewards/margins": 8.2109375, + "rewards/rejected": -4.23046875, + "step": 4939 + }, + { + "epoch": 0.9802073515551367, + 
"grad_norm": 25.451952108887497, + "learning_rate": 1.0110052983305134e-07, + "logits/chosen": 4.6171875, + "logits/rejected": 4.40234375, + "logps/chosen": -707.5, + "logps/rejected": -531.0, + "loss": 0.4073, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.84765625, + "rewards/margins": 6.859375, + "rewards/rejected": -4.01171875, + "step": 4940 + }, + { + "epoch": 0.9804057740959373, + "grad_norm": 26.888809551534283, + "learning_rate": 1.010788537165019e-07, + "logits/chosen": 4.0, + "logits/rejected": 3.8515625, + "logps/chosen": -863.0, + "logps/rejected": -586.5, + "loss": 0.326, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.35546875, + "rewards/margins": 8.6015625, + "rewards/rejected": -5.23828125, + "step": 4941 + }, + { + "epoch": 0.9806041966367379, + "grad_norm": 32.64868140884993, + "learning_rate": 1.01057392939452e-07, + "logits/chosen": 3.8984375, + "logits/rejected": 3.953125, + "logps/chosen": -912.0, + "logps/rejected": -982.5, + "loss": 0.3497, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5, + "rewards/margins": 16.0234375, + "rewards/rejected": -13.4921875, + "step": 4942 + }, + { + "epoch": 0.9808026191775385, + "grad_norm": 38.64854764134714, + "learning_rate": 1.0103614751219605e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 3.98046875, + "logps/chosen": -926.5, + "logps/rejected": -824.5, + "loss": 0.5976, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.06640625, + "rewards/margins": 5.22265625, + "rewards/rejected": -3.154296875, + "step": 4943 + }, + { + "epoch": 0.9810010417183392, + "grad_norm": 34.769904161439726, + "learning_rate": 1.0101511744492504e-07, + "logits/chosen": 3.4296875, + "logits/rejected": 3.41015625, + "logps/chosen": -930.0, + "logps/rejected": -697.5, + "loss": 0.5039, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.79296875, + "rewards/margins": 6.8984375, + "rewards/rejected": -5.11328125, + "step": 4944 + }, + { + "epoch": 0.9811994642591398, + "grad_norm": 
29.130380713079393, + "learning_rate": 1.0099430274772676e-07, + "logits/chosen": 3.4765625, + "logits/rejected": 3.875, + "logps/chosen": -1122.0, + "logps/rejected": -1262.0, + "loss": 0.3101, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.484375, + "rewards/margins": 10.6015625, + "rewards/rejected": -8.125, + "step": 4945 + }, + { + "epoch": 0.9813978867999404, + "grad_norm": 29.967641989224898, + "learning_rate": 1.0097370343058566e-07, + "logits/chosen": 4.14453125, + "logits/rejected": 4.15625, + "logps/chosen": -1277.0, + "logps/rejected": -1067.0, + "loss": 0.3299, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.185546875, + "rewards/margins": 9.4296875, + "rewards/rejected": -6.25, + "step": 4946 + }, + { + "epoch": 0.9815963093407412, + "grad_norm": 27.461379524148615, + "learning_rate": 1.0095331950338285e-07, + "logits/chosen": 4.1796875, + "logits/rejected": 4.03125, + "logps/chosen": -1158.0, + "logps/rejected": -1095.0, + "loss": 0.3945, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.671875, + "rewards/margins": 9.5859375, + "rewards/rejected": -6.8984375, + "step": 4947 + }, + { + "epoch": 0.9817947318815418, + "grad_norm": 28.759611888259258, + "learning_rate": 1.009331509758961e-07, + "logits/chosen": 4.3125, + "logits/rejected": 4.03515625, + "logps/chosen": -1317.0, + "logps/rejected": -824.0, + "loss": 0.3702, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.94140625, + "rewards/margins": 10.1796875, + "rewards/rejected": -7.21875, + "step": 4948 + }, + { + "epoch": 0.9819931544223424, + "grad_norm": 29.88514076284108, + "learning_rate": 1.0091319785779994e-07, + "logits/chosen": 4.3359375, + "logits/rejected": 4.41015625, + "logps/chosen": -1132.0, + "logps/rejected": -1047.5, + "loss": 0.3558, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.037109375, + "rewards/margins": 9.3515625, + "rewards/rejected": -6.3125, + "step": 4949 + }, + { + "epoch": 0.982191576963143, + "grad_norm": 29.603869353040366, + 
"learning_rate": 1.0089346015866553e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.03515625, + "logps/chosen": -1002.0, + "logps/rejected": -633.0, + "loss": 0.4552, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8671875, + "rewards/margins": 7.6640625, + "rewards/rejected": -4.8046875, + "step": 4950 + }, + { + "epoch": 0.9823899995039437, + "grad_norm": 30.960072319433415, + "learning_rate": 1.0087393788796067e-07, + "logits/chosen": 4.3984375, + "logits/rejected": 4.51953125, + "logps/chosen": -1019.0, + "logps/rejected": -1032.0, + "loss": 0.4392, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3203125, + "rewards/margins": 8.4453125, + "rewards/rejected": -6.109375, + "step": 4951 + }, + { + "epoch": 0.9825884220447443, + "grad_norm": 32.263290439223596, + "learning_rate": 1.0085463105504982e-07, + "logits/chosen": 4.0, + "logits/rejected": 4.015625, + "logps/chosen": -923.0, + "logps/rejected": -648.5, + "loss": 0.3691, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.29296875, + "rewards/margins": 7.87890625, + "rewards/rejected": -5.587890625, + "step": 4952 + }, + { + "epoch": 0.9827868445855449, + "grad_norm": 35.89534245148753, + "learning_rate": 1.0083553966919421e-07, + "logits/chosen": 4.0625, + "logits/rejected": 4.12890625, + "logps/chosen": -1047.0, + "logps/rejected": -976.0, + "loss": 0.3609, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.658203125, + "rewards/margins": 7.8671875, + "rewards/rejected": -5.20703125, + "step": 4953 + }, + { + "epoch": 0.9829852671263456, + "grad_norm": 27.392336216969685, + "learning_rate": 1.0081666373955156e-07, + "logits/chosen": 4.5625, + "logits/rejected": 4.4453125, + "logps/chosen": -825.5, + "logps/rejected": -1968.0, + "loss": 0.4751, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.361328125, + "rewards/margins": 14.625, + "rewards/rejected": -12.2734375, + "step": 4954 + }, + { + "epoch": 0.9831836896671462, + "grad_norm": 24.567659625387815, + "learning_rate": 
1.0079800327517636e-07, + "logits/chosen": 4.1953125, + "logits/rejected": 4.0546875, + "logps/chosen": -979.0, + "logps/rejected": -600.0, + "loss": 0.3563, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9453125, + "rewards/margins": 8.4921875, + "rewards/rejected": -5.54296875, + "step": 4955 + }, + { + "epoch": 0.9833821122079468, + "grad_norm": 24.91102020436316, + "learning_rate": 1.0077955828501972e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 3.6640625, + "logps/chosen": -1023.0, + "logps/rejected": -763.0, + "loss": 0.2984, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.07421875, + "rewards/margins": 9.625, + "rewards/rejected": -6.546875, + "step": 4956 + }, + { + "epoch": 0.9835805347487474, + "grad_norm": 27.351310690860114, + "learning_rate": 1.0076132877792932e-07, + "logits/chosen": 3.6328125, + "logits/rejected": 3.72265625, + "logps/chosen": -840.5, + "logps/rejected": -827.5, + "loss": 0.3976, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.837890625, + "rewards/margins": 9.3984375, + "rewards/rejected": -6.544921875, + "step": 4957 + }, + { + "epoch": 0.9837789572895481, + "grad_norm": 36.0768862713657, + "learning_rate": 1.0074331476264956e-07, + "logits/chosen": 4.2734375, + "logits/rejected": 4.21484375, + "logps/chosen": -767.0, + "logps/rejected": -816.0, + "loss": 0.4165, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3671875, + "rewards/margins": 8.9375, + "rewards/rejected": -6.5546875, + "step": 4958 + }, + { + "epoch": 0.9839773798303487, + "grad_norm": 30.959347902955347, + "learning_rate": 1.007255162478214e-07, + "logits/chosen": 3.640625, + "logits/rejected": 3.6328125, + "logps/chosen": -1422.0, + "logps/rejected": -802.0, + "loss": 0.2856, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.21484375, + "rewards/margins": 10.109375, + "rewards/rejected": -6.890625, + "step": 4959 + }, + { + "epoch": 0.9841758023711493, + "grad_norm": 33.855758570750965, + "learning_rate": 
1.0070793324198253e-07, + "logits/chosen": 3.50390625, + "logits/rejected": 3.7734375, + "logps/chosen": -995.0, + "logps/rejected": -782.0, + "loss": 0.5293, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.330078125, + "rewards/margins": 6.734375, + "rewards/rejected": -4.4140625, + "step": 4960 + }, + { + "epoch": 0.98437422491195, + "grad_norm": 25.663759862905657, + "learning_rate": 1.0069056575356721e-07, + "logits/chosen": 3.6640625, + "logits/rejected": 3.69140625, + "logps/chosen": -864.0, + "logps/rejected": -669.0, + "loss": 0.3136, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.275390625, + "rewards/margins": 8.53125, + "rewards/rejected": -5.2421875, + "step": 4961 + }, + { + "epoch": 0.9845726474527506, + "grad_norm": 31.565978767716476, + "learning_rate": 1.0067341379090626e-07, + "logits/chosen": 4.4453125, + "logits/rejected": 3.9609375, + "logps/chosen": -849.0, + "logps/rejected": -508.0, + "loss": 0.4895, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.498046875, + "rewards/margins": 6.92578125, + "rewards/rejected": -4.4375, + "step": 4962 + }, + { + "epoch": 0.9847710699935512, + "grad_norm": 26.05423070040976, + "learning_rate": 1.0065647736222726e-07, + "logits/chosen": 3.4375, + "logits/rejected": 3.51171875, + "logps/chosen": -1044.0, + "logps/rejected": -673.5, + "loss": 0.4186, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.791015625, + "rewards/margins": 8.341796875, + "rewards/rejected": -5.547210693359375, + "step": 4963 + }, + { + "epoch": 0.984969492534352, + "grad_norm": 33.99938057604797, + "learning_rate": 1.006397564756542e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.80078125, + "logps/chosen": -947.0, + "logps/rejected": -904.0, + "loss": 0.3782, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7734375, + "rewards/margins": 7.953125, + "rewards/rejected": -5.17578125, + "step": 4964 + }, + { + "epoch": 0.9851679150751526, + "grad_norm": 31.136415859480802, + "learning_rate": 
1.0062325113920786e-07, + "logits/chosen": 4.1484375, + "logits/rejected": 4.078125, + "logps/chosen": -1011.0, + "logps/rejected": -1025.25, + "loss": 0.4, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.615234375, + "rewards/margins": 8.5859375, + "rewards/rejected": -5.984375, + "step": 4965 + }, + { + "epoch": 0.9853663376159532, + "grad_norm": 26.058124538630796, + "learning_rate": 1.0060696136080554e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 3.69140625, + "logps/chosen": -934.0, + "logps/rejected": -596.0, + "loss": 0.3247, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.49609375, + "rewards/margins": 8.859375, + "rewards/rejected": -6.359375, + "step": 4966 + }, + { + "epoch": 0.9855647601567538, + "grad_norm": 38.348258652061396, + "learning_rate": 1.0059088714826117e-07, + "logits/chosen": 4.125, + "logits/rejected": 4.3203125, + "logps/chosen": -970.0, + "logps/rejected": -809.5, + "loss": 0.4505, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3492431640625, + "rewards/margins": 7.333984375, + "rewards/rejected": -4.986328125, + "step": 4967 + }, + { + "epoch": 0.9857631826975545, + "grad_norm": 29.53344070568481, + "learning_rate": 1.005750285092853e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.2890625, + "logps/chosen": -1106.0, + "logps/rejected": -697.0, + "loss": 0.4152, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.046875, + "rewards/margins": 7.2578125, + "rewards/rejected": -4.20703125, + "step": 4968 + }, + { + "epoch": 0.9859616052383551, + "grad_norm": 30.94302292923086, + "learning_rate": 1.0055938545148495e-07, + "logits/chosen": 3.765625, + "logits/rejected": 3.5859375, + "logps/chosen": -1149.0, + "logps/rejected": -828.0, + "loss": 0.3205, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.734375, + "rewards/margins": 9.0, + "rewards/rejected": -6.26171875, + "step": 4969 + }, + { + "epoch": 0.9861600277791557, + "grad_norm": 45.274079032806426, + "learning_rate": 
1.0054395798236384e-07, + "logits/chosen": 4.16796875, + "logits/rejected": 4.09375, + "logps/chosen": -775.5, + "logps/rejected": -514.0, + "loss": 0.4489, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.72265625, + "rewards/margins": 6.16796875, + "rewards/rejected": -3.4482421875, + "step": 4970 + }, + { + "epoch": 0.9863584503199564, + "grad_norm": 31.342969438430803, + "learning_rate": 1.0052874610932234e-07, + "logits/chosen": 4.234375, + "logits/rejected": 4.11328125, + "logps/chosen": -947.0, + "logps/rejected": -792.5, + "loss": 0.4215, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.294921875, + "rewards/margins": 6.5234375, + "rewards/rejected": -4.23828125, + "step": 4971 + }, + { + "epoch": 0.986556872860757, + "grad_norm": 24.441517247418417, + "learning_rate": 1.0051374983965718e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 3.93359375, + "logps/chosen": -777.0, + "logps/rejected": -551.0, + "loss": 0.4001, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5, + "rewards/margins": 7.2265625, + "rewards/rejected": -4.71875, + "step": 4972 + }, + { + "epoch": 0.9867552954015576, + "grad_norm": 34.498983377311376, + "learning_rate": 1.0049896918056192e-07, + "logits/chosen": 4.27734375, + "logits/rejected": 4.0546875, + "logps/chosen": -628.5, + "logps/rejected": -852.0, + "loss": 0.402, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.82421875, + "rewards/margins": 18.375, + "rewards/rejected": -16.51953125, + "step": 4973 + }, + { + "epoch": 0.9869537179423582, + "grad_norm": 37.395532477177845, + "learning_rate": 1.0048440413912654e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 4.27734375, + "logps/chosen": -750.0, + "logps/rejected": -652.0, + "loss": 0.3557, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.119140625, + "rewards/margins": 6.5859375, + "rewards/rejected": -4.47265625, + "step": 4974 + }, + { + "epoch": 0.9871521404831589, + "grad_norm": 32.852030511892174, + "learning_rate": 
1.004700547223376e-07, + "logits/chosen": 3.984375, + "logits/rejected": 3.85546875, + "logps/chosen": -816.0, + "logps/rejected": -748.0, + "loss": 0.5942, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.015625, + "rewards/margins": 6.390625, + "rewards/rejected": -4.37890625, + "step": 4975 + }, + { + "epoch": 0.9873505630239595, + "grad_norm": 28.38835212189289, + "learning_rate": 1.0045592093707828e-07, + "logits/chosen": 3.91015625, + "logits/rejected": 3.8046875, + "logps/chosen": -731.0, + "logps/rejected": -533.5, + "loss": 0.3787, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.271484375, + "rewards/margins": 7.2890625, + "rewards/rejected": -5.01171875, + "step": 4976 + }, + { + "epoch": 0.9875489855647601, + "grad_norm": 23.480512337379402, + "learning_rate": 1.0044200279012832e-07, + "logits/chosen": 4.00390625, + "logits/rejected": 3.859375, + "logps/chosen": -1184.0, + "logps/rejected": -693.0, + "loss": 0.3173, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.626953125, + "rewards/margins": 8.5625, + "rewards/rejected": -5.9375, + "step": 4977 + }, + { + "epoch": 0.9877474081055608, + "grad_norm": 30.462034002188716, + "learning_rate": 1.0042830028816399e-07, + "logits/chosen": 4.31640625, + "logits/rejected": 4.1484375, + "logps/chosen": -866.0, + "logps/rejected": -1268.0, + "loss": 0.391, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6953125, + "rewards/margins": 9.8046875, + "rewards/rejected": -7.1015625, + "step": 4978 + }, + { + "epoch": 0.9879458306463614, + "grad_norm": 26.35000909530479, + "learning_rate": 1.0041481343775817e-07, + "logits/chosen": 4.0859375, + "logits/rejected": 4.2265625, + "logps/chosen": -804.5, + "logps/rejected": -1559.0, + "loss": 0.4745, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.41552734375, + "rewards/margins": 10.296875, + "rewards/rejected": -7.875, + "step": 4979 + }, + { + "epoch": 0.988144253187162, + "grad_norm": 39.52320617503776, + "learning_rate": 
1.0040154224538022e-07, + "logits/chosen": 3.92578125, + "logits/rejected": 3.9140625, + "logps/chosen": -1096.0, + "logps/rejected": -1516.0, + "loss": 0.4523, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.783203125, + "rewards/margins": 9.546875, + "rewards/rejected": -6.76171875, + "step": 4980 + }, + { + "epoch": 0.9883426757279627, + "grad_norm": 33.6360871853142, + "learning_rate": 1.0038848671739613e-07, + "logits/chosen": 4.0234375, + "logits/rejected": 3.7734375, + "logps/chosen": -987.0, + "logps/rejected": -680.0, + "loss": 0.3458, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.03125, + "rewards/margins": 7.828125, + "rewards/rejected": -4.7890625, + "step": 4981 + }, + { + "epoch": 0.9885410982687634, + "grad_norm": 25.859253866725396, + "learning_rate": 1.0037564686006837e-07, + "logits/chosen": 4.12109375, + "logits/rejected": 4.05078125, + "logps/chosen": -1172.0, + "logps/rejected": -860.0, + "loss": 0.3686, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.15234375, + "rewards/margins": 9.21875, + "rewards/rejected": -6.0703125, + "step": 4982 + }, + { + "epoch": 0.988739520809564, + "grad_norm": 31.590653438589086, + "learning_rate": 1.0036302267955607e-07, + "logits/chosen": 4.10546875, + "logits/rejected": 3.890625, + "logps/chosen": -931.5, + "logps/rejected": -821.5, + "loss": 0.3891, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2109375, + "rewards/margins": 7.5, + "rewards/rejected": -4.2890625, + "step": 4983 + }, + { + "epoch": 0.9889379433503646, + "grad_norm": 34.49513419038822, + "learning_rate": 1.0035061418191468e-07, + "logits/chosen": 4.3671875, + "logits/rejected": 4.06640625, + "logps/chosen": -962.5, + "logps/rejected": -619.0, + "loss": 0.4404, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.89453125, + "rewards/margins": 8.4453125, + "rewards/rejected": -5.5234375, + "step": 4984 + }, + { + "epoch": 0.9891363658911653, + "grad_norm": 25.738892794540945, + "learning_rate": 
1.0033842137309648e-07, + "logits/chosen": 4.015625, + "logits/rejected": 4.34375, + "logps/chosen": -1079.0, + "logps/rejected": -693.0, + "loss": 0.3471, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2109375, + "rewards/margins": 8.2265625, + "rewards/rejected": -5.01953125, + "step": 4985 + }, + { + "epoch": 0.9893347884319659, + "grad_norm": 35.24456004076644, + "learning_rate": 1.0032644425895007e-07, + "logits/chosen": 4.609375, + "logits/rejected": 4.421875, + "logps/chosen": -733.0, + "logps/rejected": -716.0, + "loss": 0.3866, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.42578125, + "rewards/margins": 7.9453125, + "rewards/rejected": -4.515625, + "step": 4986 + }, + { + "epoch": 0.9895332109727665, + "grad_norm": 39.81162577349818, + "learning_rate": 1.0031468284522062e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.55078125, + "logps/chosen": -1003.0, + "logps/rejected": -757.0, + "loss": 0.3528, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.841796875, + "rewards/margins": 9.1015625, + "rewards/rejected": -6.2421875, + "step": 4987 + }, + { + "epoch": 0.9897316335135672, + "grad_norm": 31.725649502171574, + "learning_rate": 1.0030313713755001e-07, + "logits/chosen": 4.296875, + "logits/rejected": 4.4375, + "logps/chosen": -1358.0, + "logps/rejected": -1679.5, + "loss": 0.1962, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.40625, + "rewards/margins": 14.109375, + "rewards/rejected": -9.6796875, + "step": 4988 + }, + { + "epoch": 0.9899300560543678, + "grad_norm": 27.036058690285913, + "learning_rate": 1.0029180714147636e-07, + "logits/chosen": 4.18359375, + "logits/rejected": 4.20703125, + "logps/chosen": -1069.0, + "logps/rejected": -667.5, + "loss": 0.3745, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8515625, + "rewards/margins": 7.953125, + "rewards/rejected": -5.09375, + "step": 4989 + }, + { + "epoch": 0.9901284785951684, + "grad_norm": 23.44301571243322, + "learning_rate": 1.002806928624346e-07, + 
"logits/chosen": 4.36328125, + "logits/rejected": 4.19921875, + "logps/chosen": -1307.0, + "logps/rejected": -1287.0, + "loss": 0.3329, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1171875, + "rewards/margins": 10.78125, + "rewards/rejected": -7.671875, + "step": 4990 + }, + { + "epoch": 0.990326901135969, + "grad_norm": 31.41096557157405, + "learning_rate": 1.0026979430575594e-07, + "logits/chosen": 4.0703125, + "logits/rejected": 4.03125, + "logps/chosen": -1125.0, + "logps/rejected": -922.0, + "loss": 0.3517, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.13671875, + "rewards/margins": 8.21875, + "rewards/rejected": -5.08203125, + "step": 4991 + }, + { + "epoch": 0.9905253236767697, + "grad_norm": 24.985478314284165, + "learning_rate": 1.002591114766683e-07, + "logits/chosen": 3.89453125, + "logits/rejected": 3.7734375, + "logps/chosen": -998.0, + "logps/rejected": -669.0, + "loss": 0.3221, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.87890625, + "rewards/margins": 8.875, + "rewards/rejected": -6.0078125, + "step": 4992 + }, + { + "epoch": 0.9907237462175703, + "grad_norm": 33.92716519004069, + "learning_rate": 1.00248644380296e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 3.9453125, + "logps/chosen": -1054.0, + "logps/rejected": -606.5, + "loss": 0.3578, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.767578125, + "rewards/margins": 7.375, + "rewards/rejected": -4.59375, + "step": 4993 + }, + { + "epoch": 0.9909221687583709, + "grad_norm": 30.44424306235204, + "learning_rate": 1.0023839302165987e-07, + "logits/chosen": 3.734375, + "logits/rejected": 4.02734375, + "logps/chosen": -859.0, + "logps/rejected": -1297.0, + "loss": 0.375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.04296875, + "rewards/margins": 10.265625, + "rewards/rejected": -7.2109375, + "step": 4994 + }, + { + "epoch": 0.9911205912991716, + "grad_norm": 29.46078599887004, + "learning_rate": 1.0022835740567744e-07, + "logits/chosen": 4.32421875, + 
"logits/rejected": 4.0859375, + "logps/chosen": -1355.0, + "logps/rejected": -1036.0, + "loss": 0.2733, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0390625, + "rewards/margins": 9.7421875, + "rewards/rejected": -6.703125, + "step": 4995 + }, + { + "epoch": 0.9913190138399722, + "grad_norm": 35.342415811441604, + "learning_rate": 1.0021853753716256e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 4.1328125, + "logps/chosen": -969.0, + "logps/rejected": -875.0, + "loss": 0.3995, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.123046875, + "rewards/margins": 9.9296875, + "rewards/rejected": -7.8125, + "step": 4996 + }, + { + "epoch": 0.9915174363807728, + "grad_norm": 28.059261172007513, + "learning_rate": 1.002089334208256e-07, + "logits/chosen": 3.40234375, + "logits/rejected": 3.48828125, + "logps/chosen": -1029.0, + "logps/rejected": -671.5, + "loss": 0.3484, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.14453125, + "rewards/margins": 8.6875, + "rewards/rejected": -5.53125, + "step": 4997 + }, + { + "epoch": 0.9917158589215735, + "grad_norm": 24.44937025811403, + "learning_rate": 1.0019954506127353e-07, + "logits/chosen": 3.84765625, + "logits/rejected": 3.7734375, + "logps/chosen": -944.0, + "logps/rejected": -721.0, + "loss": 0.4499, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.18359375, + "rewards/margins": 7.9140625, + "rewards/rejected": -5.7265625, + "step": 4998 + }, + { + "epoch": 0.9919142814623741, + "grad_norm": 28.310272830637604, + "learning_rate": 1.0019037246300974e-07, + "logits/chosen": 4.02734375, + "logits/rejected": 3.8359375, + "logps/chosen": -1181.0, + "logps/rejected": -819.0, + "loss": 0.2415, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.04296875, + "rewards/margins": 10.0859375, + "rewards/rejected": -7.03125, + "step": 4999 + }, + { + "epoch": 0.9921127040031748, + "grad_norm": 22.897920518899117, + "learning_rate": 1.0018141563043421e-07, + "logits/chosen": 3.671875, + 
"logits/rejected": 3.734375, + "logps/chosen": -868.0, + "logps/rejected": -900.5, + "loss": 0.4591, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.163330078125, + "rewards/margins": 7.4296875, + "rewards/rejected": -6.2734375, + "step": 5000 + }, + { + "epoch": 0.9923111265439754, + "grad_norm": 36.95584573374834, + "learning_rate": 1.0017267456784337e-07, + "logits/chosen": 4.6640625, + "logits/rejected": 4.34375, + "logps/chosen": -1141.0, + "logps/rejected": -740.0, + "loss": 0.4281, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.92578125, + "rewards/margins": 7.546875, + "rewards/rejected": -4.625, + "step": 5001 + }, + { + "epoch": 0.9925095490847761, + "grad_norm": 28.09377948528054, + "learning_rate": 1.0016414927943009e-07, + "logits/chosen": 3.609375, + "logits/rejected": 3.62109375, + "logps/chosen": -1232.0, + "logps/rejected": -1054.0, + "loss": 0.335, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.267578125, + "rewards/margins": 10.390625, + "rewards/rejected": -8.1171875, + "step": 5002 + }, + { + "epoch": 0.9927079716255767, + "grad_norm": 28.240331685432857, + "learning_rate": 1.0015583976928382e-07, + "logits/chosen": 4.1171875, + "logits/rejected": 3.87890625, + "logps/chosen": -1220.0, + "logps/rejected": -682.0, + "loss": 0.3385, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8916015625, + "rewards/margins": 9.1484375, + "rewards/rejected": -6.2421875, + "step": 5003 + }, + { + "epoch": 0.9929063941663773, + "grad_norm": 29.011194539198645, + "learning_rate": 1.0014774604139058e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.1171875, + "logps/chosen": -1542.0, + "logps/rejected": -828.0, + "loss": 0.2843, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.47900390625, + "rewards/margins": 10.2109375, + "rewards/rejected": -7.7421875, + "step": 5004 + }, + { + "epoch": 0.993104816707178, + "grad_norm": 25.64951269791898, + "learning_rate": 1.001398680996327e-07, + "logits/chosen": 4.015625, + 
"logits/rejected": 4.22265625, + "logps/chosen": -714.5, + "logps/rejected": -628.0, + "loss": 0.47, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.6328125, + "rewards/margins": 6.02734375, + "rewards/rejected": -4.40234375, + "step": 5005 + }, + { + "epoch": 0.9933032392479786, + "grad_norm": 26.993115065581293, + "learning_rate": 1.0013220594778904e-07, + "logits/chosen": 4.15625, + "logits/rejected": 4.328125, + "logps/chosen": -1200.0, + "logps/rejected": -887.0, + "loss": 0.3961, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.5087890625, + "rewards/margins": 9.4609375, + "rewards/rejected": -5.94140625, + "step": 5006 + }, + { + "epoch": 0.9935016617887792, + "grad_norm": 37.87852420688651, + "learning_rate": 1.0012475958953513e-07, + "logits/chosen": 4.17578125, + "logits/rejected": 4.03125, + "logps/chosen": -846.0, + "logps/rejected": -771.0, + "loss": 0.3798, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.396484375, + "rewards/margins": 6.8125, + "rewards/rejected": -4.421875, + "step": 5007 + }, + { + "epoch": 0.9937000843295798, + "grad_norm": 25.880412711928873, + "learning_rate": 1.0011752902844277e-07, + "logits/chosen": 4.08984375, + "logits/rejected": 4.0546875, + "logps/chosen": -826.0, + "logps/rejected": -743.0, + "loss": 0.4027, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.345703125, + "rewards/margins": 7.609375, + "rewards/rejected": -5.28125, + "step": 5008 + }, + { + "epoch": 0.9938985068703805, + "grad_norm": 26.02076169951561, + "learning_rate": 1.0011051426798033e-07, + "logits/chosen": 4.453125, + "logits/rejected": 4.515625, + "logps/chosen": -883.0, + "logps/rejected": -1085.75, + "loss": 0.3122, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.046875, + "rewards/margins": 9.1015625, + "rewards/rejected": -7.0546875, + "step": 5009 + }, + { + "epoch": 0.9940969294111811, + "grad_norm": 35.31022275129911, + "learning_rate": 1.0010371531151269e-07, + "logits/chosen": 3.96484375, + "logits/rejected": 
3.9375, + "logps/chosen": -1127.0, + "logps/rejected": -1286.0, + "loss": 0.4644, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.03759765625, + "rewards/margins": 9.75, + "rewards/rejected": -6.71484375, + "step": 5010 + }, + { + "epoch": 0.9942953519519817, + "grad_norm": 37.37876617080011, + "learning_rate": 1.0009713216230119e-07, + "logits/chosen": 4.01953125, + "logits/rejected": 3.9140625, + "logps/chosen": -1242.0, + "logps/rejected": -750.0, + "loss": 0.3997, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.673828125, + "rewards/margins": 8.6015625, + "rewards/rejected": -5.9296875, + "step": 5011 + }, + { + "epoch": 0.9944937744927824, + "grad_norm": 30.563155228643968, + "learning_rate": 1.0009076482350368e-07, + "logits/chosen": 3.9375, + "logits/rejected": 4.13671875, + "logps/chosen": -911.0, + "logps/rejected": -833.0, + "loss": 0.2983, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.859375, + "rewards/margins": 8.3203125, + "rewards/rejected": -5.46484375, + "step": 5012 + }, + { + "epoch": 0.994692197033583, + "grad_norm": 25.13180791733795, + "learning_rate": 1.000846132981744e-07, + "logits/chosen": 3.875, + "logits/rejected": 4.0078125, + "logps/chosen": -1173.0, + "logps/rejected": -1180.5, + "loss": 0.2929, + "rewards/accuracies": 0.90625, + "rewards/chosen": 4.03125, + "rewards/margins": 15.2578125, + "rewards/rejected": -11.2109375, + "step": 5013 + }, + { + "epoch": 0.9948906195743836, + "grad_norm": 36.365345977625616, + "learning_rate": 1.000786775892641e-07, + "logits/chosen": 3.9921875, + "logits/rejected": 3.875, + "logps/chosen": -1152.0, + "logps/rejected": -1258.0, + "loss": 0.2446, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.97265625, + "rewards/margins": 10.5859375, + "rewards/rejected": -7.6015625, + "step": 5014 + }, + { + "epoch": 0.9950890421151842, + "grad_norm": 33.09248608487791, + "learning_rate": 1.0007295769962013e-07, + "logits/chosen": 4.12890625, + "logits/rejected": 4.00390625, + 
"logps/chosen": -940.0, + "logps/rejected": -1209.0, + "loss": 0.2635, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.439453125, + "rewards/margins": 12.5859375, + "rewards/rejected": -10.1875, + "step": 5015 + }, + { + "epoch": 0.9952874646559849, + "grad_norm": 50.02438920557477, + "learning_rate": 1.0006745363198617e-07, + "logits/chosen": 3.7890625, + "logits/rejected": 3.73046875, + "logps/chosen": -1042.0, + "logps/rejected": -671.0, + "loss": 0.3859, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.85546875, + "rewards/margins": 8.0625, + "rewards/rejected": -6.20703125, + "step": 5016 + }, + { + "epoch": 0.9954858871967855, + "grad_norm": 30.6216481332931, + "learning_rate": 1.0006216538900245e-07, + "logits/chosen": 3.72265625, + "logits/rejected": 3.94140625, + "logps/chosen": -708.5, + "logps/rejected": -770.0, + "loss": 0.4491, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.880859375, + "rewards/margins": 9.109375, + "rewards/rejected": -7.21484375, + "step": 5017 + }, + { + "epoch": 0.9956843097375861, + "grad_norm": 33.30144572183603, + "learning_rate": 1.0005709297320556e-07, + "logits/chosen": 3.81640625, + "logits/rejected": 3.6796875, + "logps/chosen": -824.5, + "logps/rejected": -877.5, + "loss": 0.4242, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.34375, + "rewards/margins": 8.25, + "rewards/rejected": -6.90625, + "step": 5018 + }, + { + "epoch": 0.9958827322783869, + "grad_norm": 39.06183332654046, + "learning_rate": 1.0005223638702874e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 3.94140625, + "logps/chosen": -974.0, + "logps/rejected": -829.0, + "loss": 0.5121, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.01171875, + "rewards/margins": 6.796875, + "rewards/rejected": -4.78515625, + "step": 5019 + }, + { + "epoch": 0.9960811548191875, + "grad_norm": 49.28110672994508, + "learning_rate": 1.0004759563280165e-07, + "logits/chosen": 3.6015625, + "logits/rejected": 3.28125, + "logps/chosen": -906.0, 
+ "logps/rejected": -603.5, + "loss": 0.386, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.341796875, + "rewards/margins": 6.890625, + "rewards/rejected": -5.5546875, + "step": 5020 + }, + { + "epoch": 0.9962795773599881, + "grad_norm": 27.90995904626946, + "learning_rate": 1.0004317071275022e-07, + "logits/chosen": 3.91796875, + "logits/rejected": 3.6484375, + "logps/chosen": -918.0, + "logps/rejected": -584.5, + "loss": 0.4227, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.32421875, + "rewards/margins": 7.21484375, + "rewards/rejected": -4.89453125, + "step": 5021 + }, + { + "epoch": 0.9964779999007888, + "grad_norm": 33.436475532449684, + "learning_rate": 1.0003896162899714e-07, + "logits/chosen": 3.90234375, + "logits/rejected": 4.078125, + "logps/chosen": -1472.0, + "logps/rejected": -1198.0, + "loss": 0.3305, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.796875, + "rewards/margins": 11.046875, + "rewards/rejected": -8.2265625, + "step": 5022 + }, + { + "epoch": 0.9966764224415894, + "grad_norm": 31.145753327427787, + "learning_rate": 1.0003496838356132e-07, + "logits/chosen": 4.21875, + "logits/rejected": 4.31640625, + "logps/chosen": -940.0, + "logps/rejected": -737.0, + "loss": 0.4002, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.283203125, + "rewards/margins": 6.6484375, + "rewards/rejected": -4.3671875, + "step": 5023 + }, + { + "epoch": 0.99687484498239, + "grad_norm": 29.035513040110157, + "learning_rate": 1.0003119097835838e-07, + "logits/chosen": 3.90625, + "logits/rejected": 3.6875, + "logps/chosen": -847.0, + "logps/rejected": -1004.5, + "loss": 0.4162, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.59765625, + "rewards/margins": 8.7734375, + "rewards/rejected": -6.16796875, + "step": 5024 + }, + { + "epoch": 0.9970732675231906, + "grad_norm": 26.840157499502016, + "learning_rate": 1.0002762941520018e-07, + "logits/chosen": 3.6171875, + "logits/rejected": 4.0703125, + "logps/chosen": -1244.0, + 
"logps/rejected": -1466.0, + "loss": 0.3849, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.193359375, + "rewards/margins": 11.203125, + "rewards/rejected": -8.0078125, + "step": 5025 + }, + { + "epoch": 0.9972716900639913, + "grad_norm": 33.62339463374161, + "learning_rate": 1.0002428369579515e-07, + "logits/chosen": 3.9296875, + "logits/rejected": 3.8828125, + "logps/chosen": -702.5, + "logps/rejected": -591.5, + "loss": 0.4472, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.560546875, + "rewards/margins": 5.9921875, + "rewards/rejected": -4.4375, + "step": 5026 + }, + { + "epoch": 0.9974701126047919, + "grad_norm": 32.777596092386105, + "learning_rate": 1.0002115382174822e-07, + "logits/chosen": 4.0390625, + "logits/rejected": 4.10546875, + "logps/chosen": -827.0, + "logps/rejected": -619.5, + "loss": 0.5165, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.89453125, + "rewards/margins": 5.6953125, + "rewards/rejected": -3.80859375, + "step": 5027 + }, + { + "epoch": 0.9976685351455925, + "grad_norm": 26.751309587477756, + "learning_rate": 1.0001823979456072e-07, + "logits/chosen": 3.734375, + "logits/rejected": 3.75390625, + "logps/chosen": -788.0, + "logps/rejected": -780.0, + "loss": 0.4492, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.36279296875, + "rewards/margins": 8.7578125, + "rewards/rejected": -7.390625, + "step": 5028 + }, + { + "epoch": 0.9978669576863932, + "grad_norm": 39.8879143955511, + "learning_rate": 1.0001554161563039e-07, + "logits/chosen": 3.87890625, + "logits/rejected": 4.1328125, + "logps/chosen": -824.0, + "logps/rejected": -472.5, + "loss": 0.3558, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.919921875, + "rewards/margins": 8.953125, + "rewards/rejected": -7.03125, + "step": 5029 + }, + { + "epoch": 0.9980653802271938, + "grad_norm": 25.4314817131981, + "learning_rate": 1.000130592862516e-07, + "logits/chosen": 4.6171875, + "logits/rejected": 4.59375, + "logps/chosen": -1074.0, + 
"logps/rejected": -992.0, + "loss": 0.4013, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.427734375, + "rewards/margins": 8.453125, + "rewards/rejected": -7.03125, + "step": 5030 + }, + { + "epoch": 0.9982638027679944, + "grad_norm": 37.23602818056576, + "learning_rate": 1.0001079280761497e-07, + "logits/chosen": 3.98046875, + "logits/rejected": 4.12109375, + "logps/chosen": -1211.0, + "logps/rejected": -770.0, + "loss": 0.3628, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.064453125, + "rewards/margins": 8.2265625, + "rewards/rejected": -5.15234375, + "step": 5031 + }, + { + "epoch": 0.998462225308795, + "grad_norm": 35.01990717627005, + "learning_rate": 1.0000874218080782e-07, + "logits/chosen": 4.609375, + "logits/rejected": 4.515625, + "logps/chosen": -897.0, + "logps/rejected": -924.0, + "loss": 0.4683, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.2490234375, + "rewards/margins": 7.1640625, + "rewards/rejected": -4.921875, + "step": 5032 + }, + { + "epoch": 0.9986606478495957, + "grad_norm": 24.669276268528375, + "learning_rate": 1.0000690740681368e-07, + "logits/chosen": 4.21875, + "logits/rejected": 3.8046875, + "logps/chosen": -1078.0, + "logps/rejected": -589.0, + "loss": 0.3448, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.63671875, + "rewards/margins": 8.7734375, + "rewards/rejected": -5.15234375, + "step": 5033 + }, + { + "epoch": 0.9988590703903963, + "grad_norm": 32.519953512680445, + "learning_rate": 1.0000528848651271e-07, + "logits/chosen": 3.99609375, + "logits/rejected": 3.94140625, + "logps/chosen": -820.0, + "logps/rejected": -669.0, + "loss": 0.4136, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.162109375, + "rewards/margins": 7.8671875, + "rewards/rejected": -5.7109375, + "step": 5034 + }, + { + "epoch": 0.9990574929311969, + "grad_norm": 33.75590133490221, + "learning_rate": 1.000038854206815e-07, + "logits/chosen": 4.3046875, + "logits/rejected": 4.4453125, + "logps/chosen": -925.0, + 
"logps/rejected": -884.0, + "loss": 0.3017, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.99609375, + "rewards/margins": 9.8515625, + "rewards/rejected": -6.859375, + "step": 5035 + }, + { + "epoch": 0.9992559154719977, + "grad_norm": 37.745756342248285, + "learning_rate": 1.0000269820999304e-07, + "logits/chosen": 3.9140625, + "logits/rejected": 4.0234375, + "logps/chosen": -994.0, + "logps/rejected": -723.0, + "loss": 0.5478, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.73828125, + "rewards/margins": 7.8125, + "rewards/rejected": -5.0703125, + "step": 5036 + }, + { + "epoch": 0.9994543380127983, + "grad_norm": 26.67465159984475, + "learning_rate": 1.0000172685501679e-07, + "logits/chosen": 4.28125, + "logits/rejected": 4.20703125, + "logps/chosen": -1179.0, + "logps/rejected": -933.0, + "loss": 0.304, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7890625, + "rewards/margins": 10.7421875, + "rewards/rejected": -7.953125, + "step": 5037 + }, + { + "epoch": 0.9996527605535989, + "grad_norm": 30.33663441470803, + "learning_rate": 1.0000097135621876e-07, + "logits/chosen": 3.83984375, + "logits/rejected": 3.828125, + "logps/chosen": -774.0, + "logps/rejected": -803.5, + "loss": 0.3041, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7265625, + "rewards/margins": 9.046875, + "rewards/rejected": -6.3125, + "step": 5038 + }, + { + "epoch": 0.9998511830943996, + "grad_norm": 41.47279962662163, + "learning_rate": 1.0000043171396128e-07, + "logits/chosen": 3.75, + "logits/rejected": 3.6796875, + "logps/chosen": -884.0, + "logps/rejected": -1163.0, + "loss": 0.4353, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3671875, + "rewards/margins": 9.96875, + "rewards/rejected": -7.60546875, + "step": 5039 + }, + { + "epoch": 1.0, + "grad_norm": 31.10529162692416, + "learning_rate": 1.0000010792850329e-07, + "logits/chosen": 4.489583492279053, + "logits/rejected": 4.161458492279053, + "logps/chosen": -1222.6666259765625, + "logps/rejected": 
-634.6666870117188, + "loss": 0.3938, + "rewards/accuracies": 0.8095238208770752, + "rewards/chosen": 3.5442707538604736, + "rewards/margins": 8.0, + "rewards/rejected": -4.4619140625, + "step": 5040 + }, + { + "epoch": 1.0, + "step": 5040, + "total_flos": 0.0, + "train_loss": 0.45319234429607314, + "train_runtime": 13193.0651, + "train_samples_per_second": 12.224, + "train_steps_per_second": 0.382 + } + ], + "logging_steps": 1, + "max_steps": 5040, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2520, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..371e3e8 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0683fbc7bf097b78cb204cbdfd45a909b699e2046190d3d3fcf5906b1f64a21 +size 7992