commit 06b1943f388986ceda452dad3a12ab5f979edb14 Author: ModelHub XC Date: Tue Apr 28 00:25:16 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: EleutherAI/PinkElephants-OpenHermes-7B-DPO Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..819c2e3 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,63 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text + + +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text + +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text + + +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl 
filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +training_args.bin filter=lfs diff=lfs merge=lfs -text +rng_state_0.pth filter=lfs diff=lfs merge=lfs -text +model-00001-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.model filter=lfs diff=lfs merge=lfs -text +rng_state_2.pth filter=lfs diff=lfs merge=lfs -text +rng_state_7.pth filter=lfs diff=lfs merge=lfs -text +scheduler.pt filter=lfs diff=lfs merge=lfs -text +rng_state_1.pth filter=lfs diff=lfs merge=lfs -text +rng_state_5.pth filter=lfs diff=lfs merge=lfs -text +model-00002-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text +rng_state_4.pth filter=lfs diff=lfs merge=lfs -text +rng_state_3.pth filter=lfs diff=lfs merge=lfs -text +rng_state_6.pth filter=lfs diff=lfs merge=lfs -text +model-00003-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..c2f0960 --- /dev/null +++ b/README.md @@ -0,0 +1,48 @@ +--- +license: Apache License 2.0 +tags: [] + +#model-type: +##如 gpt、phi、llama、chatglm、baichuan 等 +#- gpt + +#domain: +##如 nlp、cv、audio、multi-modal +#- nlp + +#language: +##语言代码列表 https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa +#- cn + +#metrics: +##如 CIDEr、Blue、ROUGE 等 +#- CIDEr + +#tags: +##各种自定义,包括 pretrained、fine-tuned、instruction-tuned、RL-tuned 等训练方法和其他 +#- pretrained + +#tools: +##如 vllm、fastchat、llamacpp、AdaSeq 等 +#- vllm +--- +### 当前模型的贡献者未提供更加详细的模型介绍。模型文件和权重,可浏览“模型文件”页面获取。 +#### 您可以通过如下git clone命令,或者ModelScope SDK来下载模型 + +SDK下载 +```bash +#安装ModelScope +pip install modelscope +``` +```python +#SDK模型下载 +from modelscope import snapshot_download +model_dir = snapshot_download('EleutherAI/PinkElephants-OpenHermes-7B-DPO') +``` 
+Git下载 +``` +#Git模型下载 +git clone https://www.modelscope.cn/EleutherAI/PinkElephants-OpenHermes-7B-DPO.git +``` + +

如果您是本模型的贡献者,我们邀请您根据模型贡献文档,及时完善模型卡片内容。

\ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..c0d07ff --- /dev/null +++ b/config.json @@ -0,0 +1,28 @@ +{ + "_name_or_path": "teknium/OpenHermes-7B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "pad_token_id": 0, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.35.0", + "use_cache": false, + "vocab_size": 32000 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..972c9af --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.35.0", + "use_cache": false +} diff --git a/latest b/latest new file mode 100644 index 0000000..4745bfb --- /dev/null +++ b/latest @@ -0,0 +1 @@ +global_step6500 \ No newline at end of file diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors new file mode 100644 index 0000000..558bcf3 --- /dev/null +++ b/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4909759fb0b076360379f38060ee3bbb190548b2e8c20b1bac0ceaf9fc9aa25c +size 4938985352 diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors new file mode 
100644 index 0000000..62af09e --- /dev/null +++ b/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bac0946f93f7eef407aaa89d9af647590c69f735897cf893d0cc6272822c091c +size 4947390880 diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors new file mode 100644 index 0000000..eb37fa3 --- /dev/null +++ b/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85d7170b54ed4cc4d6c24069851a8cf582f7cfc65f8830d25aa2f542fba7a69c +size 3590488816 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..13674e5 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 13476831232 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + 
"model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.k_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": 
"model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + 
"model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": 
"model-00003-of-00003.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.v_proj.weight": 
"model-00003-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": 
"model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.norm.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/rng_state_0.pth b/rng_state_0.pth new file mode 100644 index 0000000..b924077 --- /dev/null +++ b/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82b24325d81a044e8bbd09e746c37b036af86adf4fb1c64040a20637033745ff +size 15920 diff --git a/rng_state_1.pth b/rng_state_1.pth new file mode 100644 index 0000000..96472bc --- /dev/null +++ b/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a721ae4b835fe5dbd62ae8583d1b6c36f1608b78bab6c212f77501a374ef670 +size 15920 diff --git a/rng_state_2.pth b/rng_state_2.pth new file mode 100644 index 0000000..3f119c4 --- /dev/null +++ b/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46bba9eec4c436392623174c5297f32e086c54c3140a0d4dc78fe0eafda10af1 +size 15920 diff --git a/rng_state_3.pth b/rng_state_3.pth new file mode 100644 index 0000000..10d3448 --- /dev/null +++ b/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96f4e029d6c401233af5c654d3453d3a8956f45b2bf7dabb03b521f1df3b8677 +size 15920 diff --git a/rng_state_4.pth b/rng_state_4.pth new file mode 100644 index 0000000..911ceaf --- /dev/null +++ b/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:068b9d6baa376cfeffc2af79200341998fd6d667c35e6d2d397082dee1fd0061 +size 15920 diff --git a/rng_state_5.pth b/rng_state_5.pth new file mode 100644 index 0000000..4388d8e --- /dev/null +++ b/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7a4bd643add773562f4311bdaac476f7f4949255247062a840bac59899643aa0 +size 15920 diff --git a/rng_state_6.pth b/rng_state_6.pth new file mode 100644 index 0000000..ea59ab1 --- /dev/null +++ b/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38ae37bdef2c43502fb6e643a18603e677e727d6efb83b12df5288d1aef2c232 +size 15920 diff --git a/rng_state_7.pth b/rng_state_7.pth new file mode 100644 index 0000000..de32ef7 --- /dev/null +++ b/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30143e9d5dbc43a871356723e83ae85e73f3fc9cc3fc5d82b46017907ade2aac +size 15920 diff --git a/scheduler.pt b/scheduler.pt new file mode 100644 index 0000000..5aca10b --- /dev/null +++ b/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9557a57f742a19df95c76b6d6267ac96bdae20526566d4214c0a2d1d82397768 +size 1064 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..8bedc05 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..21779d6 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd04f0eadf90287bd26e1a183ac487d8a141b09b06aecb7725bbdd343640f2e +size 1842767 diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..6c00c74 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..67e4172 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "trust_remote_code": false, + "unk_token": "", + "use_default_system_prompt": true, + "use_fast": true +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..4215fac --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,10173 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9666818804198996, + "eval_steps": 100, + "global_step": 6500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + 
"epoch": 0.0, + "learning_rate": 4.559270516717325e-10, + "logits/chosen": -0.9643518328666687, + "logits/rejected": -0.9552459120750427, + "logps/chosen": -76.82135772705078, + "logps/rejected": -59.52644348144531, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 4.559270516717325e-09, + "logits/chosen": -0.9517099857330322, + "logits/rejected": -0.9616715908050537, + "logps/chosen": -93.39585876464844, + "logps/rejected": -69.06417846679688, + "loss": 0.6969, + "rewards/accuracies": 0.3055555522441864, + "rewards/chosen": -0.04679955542087555, + "rewards/margins": -0.035246770828962326, + "rewards/rejected": -0.011552784591913223, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 9.11854103343465e-09, + "logits/chosen": -0.9816751480102539, + "logits/rejected": -1.003169298171997, + "logps/chosen": -90.08583068847656, + "logps/rejected": -69.84603118896484, + "loss": 0.6993, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.00020122528076171875, + "rewards/margins": -0.014502143487334251, + "rewards/rejected": 0.014300918206572533, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 1.3677811550151975e-08, + "logits/chosen": -0.9795465469360352, + "logits/rejected": -0.9944146275520325, + "logps/chosen": -90.3318099975586, + "logps/rejected": -66.77433013916016, + "loss": 0.6898, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.007479047868400812, + "rewards/margins": 0.011048078536987305, + "rewards/rejected": -0.0035690306685864925, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 1.82370820668693e-08, + "logits/chosen": -0.9575172662734985, + "logits/rejected": -0.974955677986145, + "logps/chosen": -94.9634017944336, + "logps/rejected": -64.13943481445312, + "loss": 0.6901, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.020472276955842972, + 
"rewards/margins": -0.0019337296253070235, + "rewards/rejected": -0.018538545817136765, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 2.2796352583586623e-08, + "logits/chosen": -0.9589303731918335, + "logits/rejected": -0.9642618894577026, + "logps/chosen": -96.45872497558594, + "logps/rejected": -69.0987319946289, + "loss": 0.679, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0022797822020947933, + "rewards/margins": 0.03321406990289688, + "rewards/rejected": -0.03549385070800781, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 2.735562310030395e-08, + "logits/chosen": -0.9560245275497437, + "logits/rejected": -0.9745736122131348, + "logps/chosen": -90.70116424560547, + "logps/rejected": -70.5436782836914, + "loss": 0.6589, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.014189362525939941, + "rewards/margins": 0.053305577486753464, + "rewards/rejected": -0.03911621496081352, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 3.191489361702128e-08, + "logits/chosen": -1.006732702255249, + "logits/rejected": -1.0067856311798096, + "logps/chosen": -86.19349670410156, + "logps/rejected": -66.15892028808594, + "loss": 0.6445, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.066792331635952, + "rewards/margins": 0.12021535634994507, + "rewards/rejected": -0.05342302471399307, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 3.64741641337386e-08, + "logits/chosen": -1.0003823041915894, + "logits/rejected": -1.0047554969787598, + "logps/chosen": -96.32222747802734, + "logps/rejected": -68.91173553466797, + "loss": 0.6131, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.09268401563167572, + "rewards/margins": 0.19902931153774261, + "rewards/rejected": -0.1063452959060669, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.1033434650455923e-08, + "logits/chosen": -0.9865853190422058, + "logits/rejected": -0.9936298131942749, + "logps/chosen": -89.23389434814453, + 
"logps/rejected": -64.02348327636719, + "loss": 0.5757, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.13552097976207733, + "rewards/margins": 0.2616090774536133, + "rewards/rejected": -0.12608811259269714, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.559270516717325e-08, + "logits/chosen": -0.9565617442131042, + "logits/rejected": -0.9617575407028198, + "logps/chosen": -91.22322845458984, + "logps/rejected": -68.244140625, + "loss": 0.5412, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.14585547149181366, + "rewards/margins": 0.3651939332485199, + "rewards/rejected": -0.21933846175670624, + "step": 100 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -0.9840940237045288, + "eval_logits/rejected": -0.9992539286613464, + "eval_logps/chosen": -89.1026611328125, + "eval_logps/rejected": -65.7355728149414, + "eval_loss": 0.5342935919761658, + "eval_rewards/accuracies": 0.9441340565681458, + "eval_rewards/chosen": 0.16487935185432434, + "eval_rewards/margins": 0.3618040084838867, + "eval_rewards/rejected": -0.19692467153072357, + "eval_runtime": 71.4873, + "eval_samples_per_second": 40.035, + "eval_steps_per_second": 2.504, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 5.015197568389058e-08, + "logits/chosen": -0.9964181780815125, + "logits/rejected": -1.000222086906433, + "logps/chosen": -89.18244934082031, + "logps/rejected": -67.94490814208984, + "loss": 0.5304, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.12820395827293396, + "rewards/margins": 0.37578168511390686, + "rewards/rejected": -0.2475777566432953, + "step": 110 + }, + { + "epoch": 0.05, + "learning_rate": 5.47112462006079e-08, + "logits/chosen": -0.9542981386184692, + "logits/rejected": -0.9645611643791199, + "logps/chosen": -85.86537170410156, + "logps/rejected": -67.51045989990234, + "loss": 0.4784, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.18605713546276093, + "rewards/margins": 0.5225586891174316, + 
"rewards/rejected": -0.3365015983581543, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 5.9270516717325223e-08, + "logits/chosen": -0.9908889532089233, + "logits/rejected": -0.9953571557998657, + "logps/chosen": -92.44700622558594, + "logps/rejected": -69.91798400878906, + "loss": 0.3954, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.338260680437088, + "rewards/margins": 0.743990957736969, + "rewards/rejected": -0.4057301878929138, + "step": 130 + }, + { + "epoch": 0.06, + "learning_rate": 6.382978723404255e-08, + "logits/chosen": -0.954239010810852, + "logits/rejected": -0.968449592590332, + "logps/chosen": -91.10675811767578, + "logps/rejected": -67.44385528564453, + "loss": 0.3584, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.36729928851127625, + "rewards/margins": 0.9051514863967896, + "rewards/rejected": -0.5378521680831909, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 6.838905775075987e-08, + "logits/chosen": -0.9643675088882446, + "logits/rejected": -0.9654477834701538, + "logps/chosen": -93.19139099121094, + "logps/rejected": -69.21319580078125, + "loss": 0.3223, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.491039901971817, + "rewards/margins": 1.1034448146820068, + "rewards/rejected": -0.6124049425125122, + "step": 150 + }, + { + "epoch": 0.07, + "learning_rate": 7.29483282674772e-08, + "logits/chosen": -0.9584270715713501, + "logits/rejected": -0.9729134440422058, + "logps/chosen": -93.25248718261719, + "logps/rejected": -67.92393493652344, + "loss": 0.3027, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5195748805999756, + "rewards/margins": 1.1697877645492554, + "rewards/rejected": -0.6502128839492798, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 7.750759878419453e-08, + "logits/chosen": -0.9704807996749878, + "logits/rejected": -0.9716874361038208, + "logps/chosen": -83.62016296386719, + "logps/rejected": -66.22118377685547, + "loss": 0.3037, + 
"rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.4028417468070984, + "rewards/margins": 1.2314786911010742, + "rewards/rejected": -0.8286369442939758, + "step": 170 + }, + { + "epoch": 0.08, + "learning_rate": 8.206686930091185e-08, + "logits/chosen": -0.9470335245132446, + "logits/rejected": -0.963607668876648, + "logps/chosen": -87.67333984375, + "logps/rejected": -67.34207916259766, + "loss": 0.2843, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5842958688735962, + "rewards/margins": 1.32895028591156, + "rewards/rejected": -0.7446545362472534, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 8.662613981762918e-08, + "logits/chosen": -0.965084433555603, + "logits/rejected": -0.9808289408683777, + "logps/chosen": -89.90495300292969, + "logps/rejected": -65.31876373291016, + "loss": 0.2496, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6354175806045532, + "rewards/margins": 1.4908440113067627, + "rewards/rejected": -0.8554266095161438, + "step": 190 + }, + { + "epoch": 0.09, + "learning_rate": 9.11854103343465e-08, + "logits/chosen": -0.9802848100662231, + "logits/rejected": -0.9860008358955383, + "logps/chosen": -83.04852294921875, + "logps/rejected": -66.60576629638672, + "loss": 0.2248, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6749335527420044, + "rewards/margins": 1.5874103307724, + "rewards/rejected": -0.9124768972396851, + "step": 200 + }, + { + "epoch": 0.09, + "eval_logits/chosen": -0.9838407635688782, + "eval_logits/rejected": -0.996880292892456, + "eval_logps/chosen": -88.08007049560547, + "eval_logps/rejected": -67.23590850830078, + "eval_loss": 0.22697897255420685, + "eval_rewards/accuracies": 0.9832402467727661, + "eval_rewards/chosen": 0.6761797666549683, + "eval_rewards/margins": 1.6232739686965942, + "eval_rewards/rejected": -0.9470942616462708, + "eval_runtime": 105.1739, + "eval_samples_per_second": 27.212, + "eval_steps_per_second": 1.702, + "step": 200 + }, + 
{ + "epoch": 0.1, + "learning_rate": 9.574468085106384e-08, + "logits/chosen": -0.9623929262161255, + "logits/rejected": -0.9724255800247192, + "logps/chosen": -82.79269409179688, + "logps/rejected": -68.85140228271484, + "loss": 0.1957, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6824924349784851, + "rewards/margins": 1.8234453201293945, + "rewards/rejected": -1.1409530639648438, + "step": 210 + }, + { + "epoch": 0.1, + "learning_rate": 1.0030395136778115e-07, + "logits/chosen": -0.9426813125610352, + "logits/rejected": -0.95277339220047, + "logps/chosen": -90.78047943115234, + "logps/rejected": -70.69515991210938, + "loss": 0.1768, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7748227715492249, + "rewards/margins": 2.0056302547454834, + "rewards/rejected": -1.2308075428009033, + "step": 220 + }, + { + "epoch": 0.1, + "learning_rate": 1.0486322188449848e-07, + "logits/chosen": -0.9746893048286438, + "logits/rejected": -0.9837571978569031, + "logps/chosen": -91.14712524414062, + "logps/rejected": -69.38472747802734, + "loss": 0.1403, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9424777030944824, + "rewards/margins": 2.5599138736724854, + "rewards/rejected": -1.6174360513687134, + "step": 230 + }, + { + "epoch": 0.11, + "learning_rate": 1.094224924012158e-07, + "logits/chosen": -0.9637104868888855, + "logits/rejected": -0.9687842130661011, + "logps/chosen": -88.99737548828125, + "logps/rejected": -69.65807342529297, + "loss": 0.1228, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0617095232009888, + "rewards/margins": 2.6283512115478516, + "rewards/rejected": -1.5666416883468628, + "step": 240 + }, + { + "epoch": 0.11, + "learning_rate": 1.1398176291793313e-07, + "logits/chosen": -0.9593694806098938, + "logits/rejected": -0.9832932353019714, + "logps/chosen": -89.79000091552734, + "logps/rejected": -71.10289001464844, + "loss": 0.116, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0695412158966064, 
+ "rewards/margins": 2.6741912364959717, + "rewards/rejected": -1.6046499013900757, + "step": 250 + }, + { + "epoch": 0.12, + "learning_rate": 1.1854103343465045e-07, + "logits/chosen": -0.9898384809494019, + "logits/rejected": -0.9927492141723633, + "logps/chosen": -84.07673645019531, + "logps/rejected": -67.94017028808594, + "loss": 0.1081, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1412315368652344, + "rewards/margins": 2.985485315322876, + "rewards/rejected": -1.8442538976669312, + "step": 260 + }, + { + "epoch": 0.12, + "learning_rate": 1.2310030395136776e-07, + "logits/chosen": -0.9738420248031616, + "logits/rejected": -0.9792343974113464, + "logps/chosen": -88.48084259033203, + "logps/rejected": -75.20460510253906, + "loss": 0.1053, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2394745349884033, + "rewards/margins": 3.0349550247192383, + "rewards/rejected": -1.7954803705215454, + "step": 270 + }, + { + "epoch": 0.13, + "learning_rate": 1.276595744680851e-07, + "logits/chosen": -0.973858654499054, + "logits/rejected": -0.977423369884491, + "logps/chosen": -88.2381820678711, + "logps/rejected": -69.44757080078125, + "loss": 0.0878, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0679655075073242, + "rewards/margins": 3.237596035003662, + "rewards/rejected": -2.169631004333496, + "step": 280 + }, + { + "epoch": 0.13, + "learning_rate": 1.3221884498480242e-07, + "logits/chosen": -0.9805269241333008, + "logits/rejected": -0.9859040975570679, + "logps/chosen": -86.40400695800781, + "logps/rejected": -73.2696533203125, + "loss": 0.0876, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1778849363327026, + "rewards/margins": 3.2878527641296387, + "rewards/rejected": -2.1099677085876465, + "step": 290 + }, + { + "epoch": 0.14, + "learning_rate": 1.3677811550151974e-07, + "logits/chosen": -0.9615720510482788, + "logits/rejected": -0.9615543484687805, + "logps/chosen": -85.02987670898438, + "logps/rejected": -69.7044906616211, + "loss": 
0.0827, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.123947262763977, + "rewards/margins": 3.40171480178833, + "rewards/rejected": -2.2777674198150635, + "step": 300 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -0.9878929853439331, + "eval_logits/rejected": -0.9959621429443359, + "eval_logps/chosen": -86.76927947998047, + "eval_logps/rejected": -69.99213409423828, + "eval_loss": 0.08308280259370804, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 1.3315751552581787, + "eval_rewards/margins": 3.6567811965942383, + "eval_rewards/rejected": -2.3252058029174805, + "eval_runtime": 61.2711, + "eval_samples_per_second": 46.71, + "eval_steps_per_second": 2.921, + "step": 300 + }, + { + "epoch": 0.14, + "learning_rate": 1.4133738601823708e-07, + "logits/chosen": -0.9472341537475586, + "logits/rejected": -0.9612518548965454, + "logps/chosen": -89.58518981933594, + "logps/rejected": -70.13312530517578, + "loss": 0.0787, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4316556453704834, + "rewards/margins": 3.8420116901397705, + "rewards/rejected": -2.410356044769287, + "step": 310 + }, + { + "epoch": 0.15, + "learning_rate": 1.458966565349544e-07, + "logits/chosen": -0.9657572507858276, + "logits/rejected": -0.9684290885925293, + "logps/chosen": -86.15750122070312, + "logps/rejected": -69.44698333740234, + "loss": 0.0743, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.158479928970337, + "rewards/margins": 3.4451236724853516, + "rewards/rejected": -2.2866437435150146, + "step": 320 + }, + { + "epoch": 0.15, + "learning_rate": 1.5045592705167174e-07, + "logits/chosen": -0.9523347020149231, + "logits/rejected": -0.9598930478096008, + "logps/chosen": -91.19930267333984, + "logps/rejected": -71.31523895263672, + "loss": 0.0718, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.529072642326355, + "rewards/margins": 3.7886452674865723, + "rewards/rejected": -2.2595720291137695, + "step": 330 + }, + { + "epoch": 
0.16, + "learning_rate": 1.5501519756838906e-07, + "logits/chosen": -0.9740470051765442, + "logits/rejected": -0.9752241373062134, + "logps/chosen": -82.92659759521484, + "logps/rejected": -69.77666473388672, + "loss": 0.0723, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3220869302749634, + "rewards/margins": 4.073119163513184, + "rewards/rejected": -2.7510321140289307, + "step": 340 + }, + { + "epoch": 0.16, + "learning_rate": 1.5957446808510638e-07, + "logits/chosen": -0.979371190071106, + "logits/rejected": -0.9843432307243347, + "logps/chosen": -88.71659851074219, + "logps/rejected": -76.50203704833984, + "loss": 0.0682, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.327655553817749, + "rewards/margins": 4.103621006011963, + "rewards/rejected": -2.7759652137756348, + "step": 350 + }, + { + "epoch": 0.16, + "learning_rate": 1.641337386018237e-07, + "logits/chosen": -0.9631199836730957, + "logits/rejected": -0.9769188165664673, + "logps/chosen": -88.78105163574219, + "logps/rejected": -71.40655517578125, + "loss": 0.0576, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.553839087486267, + "rewards/margins": 4.10164213180542, + "rewards/rejected": -2.547802448272705, + "step": 360 + }, + { + "epoch": 0.17, + "learning_rate": 1.6869300911854104e-07, + "logits/chosen": -0.9622189402580261, + "logits/rejected": -0.9701143503189087, + "logps/chosen": -89.26042938232422, + "logps/rejected": -75.21412658691406, + "loss": 0.0687, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.384950041770935, + "rewards/margins": 4.296316623687744, + "rewards/rejected": -2.9113662242889404, + "step": 370 + }, + { + "epoch": 0.17, + "learning_rate": 1.7325227963525835e-07, + "logits/chosen": -0.9843562245368958, + "logits/rejected": -0.9918986558914185, + "logps/chosen": -86.35823059082031, + "logps/rejected": -72.2359848022461, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5267091989517212, + "rewards/margins": 
4.862682819366455, + "rewards/rejected": -3.3359732627868652, + "step": 380 + }, + { + "epoch": 0.18, + "learning_rate": 1.7781155015197567e-07, + "logits/chosen": -0.9975897073745728, + "logits/rejected": -0.9955156445503235, + "logps/chosen": -91.64771270751953, + "logps/rejected": -74.9527816772461, + "loss": 0.0481, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4639146327972412, + "rewards/margins": 5.018456935882568, + "rewards/rejected": -3.5545425415039062, + "step": 390 + }, + { + "epoch": 0.18, + "learning_rate": 1.82370820668693e-07, + "logits/chosen": -0.9872757792472839, + "logits/rejected": -0.9929397702217102, + "logps/chosen": -86.04756164550781, + "logps/rejected": -75.50514221191406, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6687867641448975, + "rewards/margins": 5.520654201507568, + "rewards/rejected": -3.851867198944092, + "step": 400 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -1.001560091972351, + "eval_logits/rejected": -0.9991575479507446, + "eval_logps/chosen": -86.00261688232422, + "eval_logps/rejected": -72.45541381835938, + "eval_loss": 0.04544173553586006, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.7149040699005127, + "eval_rewards/margins": 5.271754264831543, + "eval_rewards/rejected": -3.556849956512451, + "eval_runtime": 62.4842, + "eval_samples_per_second": 45.804, + "eval_steps_per_second": 2.865, + "step": 400 + }, + { + "epoch": 0.19, + "learning_rate": 1.869300911854103e-07, + "logits/chosen": -1.0112515687942505, + "logits/rejected": -1.0055806636810303, + "logps/chosen": -86.33168029785156, + "logps/rejected": -79.20642852783203, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5995696783065796, + "rewards/margins": 5.682262420654297, + "rewards/rejected": -4.082693099975586, + "step": 410 + }, + { + "epoch": 0.19, + "learning_rate": 1.9148936170212767e-07, + "logits/chosen": -0.9795786142349243, + "logits/rejected": 
-0.9767486453056335, + "logps/chosen": -89.1741714477539, + "logps/rejected": -75.95233917236328, + "loss": 0.0414, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1586321592330933, + "rewards/margins": 5.276698112487793, + "rewards/rejected": -4.118066310882568, + "step": 420 + }, + { + "epoch": 0.2, + "learning_rate": 1.96048632218845e-07, + "logits/chosen": -1.0036985874176025, + "logits/rejected": -0.9971052408218384, + "logps/chosen": -86.45294189453125, + "logps/rejected": -79.10576629638672, + "loss": 0.0317, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8366873264312744, + "rewards/margins": 6.2948222160339355, + "rewards/rejected": -4.458134651184082, + "step": 430 + }, + { + "epoch": 0.2, + "learning_rate": 2.006079027355623e-07, + "logits/chosen": -1.0170124769210815, + "logits/rejected": -1.0096765756607056, + "logps/chosen": -90.57597351074219, + "logps/rejected": -72.65457916259766, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.073103427886963, + "rewards/margins": 6.505836486816406, + "rewards/rejected": -4.432732582092285, + "step": 440 + }, + { + "epoch": 0.21, + "learning_rate": 2.0516717325227962e-07, + "logits/chosen": -1.0112982988357544, + "logits/rejected": -1.0087357759475708, + "logps/chosen": -87.71390533447266, + "logps/rejected": -78.8847427368164, + "loss": 0.037, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.966575264930725, + "rewards/margins": 7.236502170562744, + "rewards/rejected": -5.26992654800415, + "step": 450 + }, + { + "epoch": 0.21, + "learning_rate": 2.0972644376899697e-07, + "logits/chosen": -0.9955867528915405, + "logits/rejected": -0.9789519309997559, + "logps/chosen": -93.02728271484375, + "logps/rejected": -79.93946838378906, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8666505813598633, + "rewards/margins": 7.276576042175293, + "rewards/rejected": -5.409926414489746, + "step": 460 + }, + { + "epoch": 0.21, + "learning_rate": 
2.1428571428571428e-07, + "logits/chosen": -1.0107711553573608, + "logits/rejected": -1.015987753868103, + "logps/chosen": -91.70677185058594, + "logps/rejected": -78.36326599121094, + "loss": 0.0279, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9241142272949219, + "rewards/margins": 7.251917839050293, + "rewards/rejected": -5.327803611755371, + "step": 470 + }, + { + "epoch": 0.22, + "learning_rate": 2.188449848024316e-07, + "logits/chosen": -1.0210905075073242, + "logits/rejected": -1.0056220293045044, + "logps/chosen": -90.60785675048828, + "logps/rejected": -77.8875961303711, + "loss": 0.0252, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8818817138671875, + "rewards/margins": 7.938240051269531, + "rewards/rejected": -6.056358814239502, + "step": 480 + }, + { + "epoch": 0.22, + "learning_rate": 2.2340425531914892e-07, + "logits/chosen": -1.0591073036193848, + "logits/rejected": -1.0334078073501587, + "logps/chosen": -86.05088806152344, + "logps/rejected": -80.09144592285156, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9079704284667969, + "rewards/margins": 7.765076637268066, + "rewards/rejected": -5.857105255126953, + "step": 490 + }, + { + "epoch": 0.23, + "learning_rate": 2.2796352583586626e-07, + "logits/chosen": -1.044002652168274, + "logits/rejected": -1.0229895114898682, + "logps/chosen": -81.89948272705078, + "logps/rejected": -77.97576141357422, + "loss": 0.0217, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5751844644546509, + "rewards/margins": 7.957855224609375, + "rewards/rejected": -6.382671356201172, + "step": 500 + }, + { + "epoch": 0.23, + "eval_logits/chosen": -1.0691347122192383, + "eval_logits/rejected": -1.0533946752548218, + "eval_logps/chosen": -85.60762023925781, + "eval_logps/rejected": -77.9649887084961, + "eval_loss": 0.021840358152985573, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 1.912401795387268, + 
"eval_rewards/margins": 8.22403335571289, + "eval_rewards/rejected": -6.3116326332092285, + "eval_runtime": 60.6624, + "eval_samples_per_second": 47.179, + "eval_steps_per_second": 2.951, + "step": 500 + }, + { + "epoch": 0.23, + "learning_rate": 2.3252279635258358e-07, + "logits/chosen": -1.0656869411468506, + "logits/rejected": -1.0443495512008667, + "logps/chosen": -80.74382781982422, + "logps/rejected": -80.70405578613281, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.984548807144165, + "rewards/margins": 8.708304405212402, + "rewards/rejected": -6.7237548828125, + "step": 510 + }, + { + "epoch": 0.24, + "learning_rate": 2.370820668693009e-07, + "logits/chosen": -1.0407330989837646, + "logits/rejected": -1.019323706626892, + "logps/chosen": -94.31208801269531, + "logps/rejected": -84.92703247070312, + "loss": 0.0192, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9942207336425781, + "rewards/margins": 8.950647354125977, + "rewards/rejected": -6.956425666809082, + "step": 520 + }, + { + "epoch": 0.24, + "learning_rate": 2.4164133738601824e-07, + "logits/chosen": -1.0353573560714722, + "logits/rejected": -1.0317944288253784, + "logps/chosen": -84.43415832519531, + "logps/rejected": -80.83734130859375, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.38482928276062, + "rewards/margins": 9.657920837402344, + "rewards/rejected": -7.273091793060303, + "step": 530 + }, + { + "epoch": 0.25, + "learning_rate": 2.4620060790273553e-07, + "logits/chosen": -1.0673249959945679, + "logits/rejected": -1.0390758514404297, + "logps/chosen": -86.52366638183594, + "logps/rejected": -82.38668823242188, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5419979095458984, + "rewards/margins": 9.403617858886719, + "rewards/rejected": -7.861618995666504, + "step": 540 + }, + { + "epoch": 0.25, + "learning_rate": 2.5075987841945287e-07, + "logits/chosen": -1.0771315097808838, + "logits/rejected": 
-1.0471489429473877, + "logps/chosen": -94.00312805175781, + "logps/rejected": -81.71351623535156, + "loss": 0.0146, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8300154209136963, + "rewards/margins": 9.741002082824707, + "rewards/rejected": -7.910986423492432, + "step": 550 + }, + { + "epoch": 0.26, + "learning_rate": 2.553191489361702e-07, + "logits/chosen": -1.0417684316635132, + "logits/rejected": -1.0242894887924194, + "logps/chosen": -85.45355987548828, + "logps/rejected": -84.82173156738281, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.176011323928833, + "rewards/margins": 10.29346752166748, + "rewards/rejected": -8.117456436157227, + "step": 560 + }, + { + "epoch": 0.26, + "learning_rate": 2.598784194528875e-07, + "logits/chosen": -1.0521628856658936, + "logits/rejected": -1.0290045738220215, + "logps/chosen": -86.50548553466797, + "logps/rejected": -84.81304931640625, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.396401286125183, + "rewards/margins": 9.808305740356445, + "rewards/rejected": -8.411903381347656, + "step": 570 + }, + { + "epoch": 0.26, + "learning_rate": 2.6443768996960485e-07, + "logits/chosen": -1.0446751117706299, + "logits/rejected": -1.0253543853759766, + "logps/chosen": -83.4941635131836, + "logps/rejected": -81.4127197265625, + "loss": 0.0107, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4424480199813843, + "rewards/margins": 10.39416217803955, + "rewards/rejected": -8.951713562011719, + "step": 580 + }, + { + "epoch": 0.27, + "learning_rate": 2.689969604863222e-07, + "logits/chosen": -1.0527292490005493, + "logits/rejected": -1.028416633605957, + "logps/chosen": -89.69185638427734, + "logps/rejected": -84.93494415283203, + "loss": 0.016, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.6588973999023438, + "rewards/margins": 10.561676025390625, + "rewards/rejected": -8.902778625488281, + "step": 590 + }, + { + "epoch": 0.27, + 
"learning_rate": 2.735562310030395e-07, + "logits/chosen": -1.0560492277145386, + "logits/rejected": -1.031136155128479, + "logps/chosen": -88.6587142944336, + "logps/rejected": -89.13833618164062, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.94255793094635, + "rewards/margins": 11.583595275878906, + "rewards/rejected": -9.641037940979004, + "step": 600 + }, + { + "epoch": 0.27, + "eval_logits/chosen": -1.0850204229354858, + "eval_logits/rejected": -1.0588685274124146, + "eval_logps/chosen": -85.50906372070312, + "eval_logps/rejected": -83.90418243408203, + "eval_loss": 0.013939271681010723, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 1.961681842803955, + "eval_rewards/margins": 11.242914199829102, + "eval_rewards/rejected": -9.281231880187988, + "eval_runtime": 67.4874, + "eval_samples_per_second": 42.408, + "eval_steps_per_second": 2.652, + "step": 600 + }, + { + "epoch": 0.28, + "learning_rate": 2.781155015197568e-07, + "logits/chosen": -1.0646154880523682, + "logits/rejected": -1.0403302907943726, + "logps/chosen": -87.16560363769531, + "logps/rejected": -84.03047943115234, + "loss": 0.0133, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1723438501358032, + "rewards/margins": 10.494967460632324, + "rewards/rejected": -9.322624206542969, + "step": 610 + }, + { + "epoch": 0.28, + "learning_rate": 2.8267477203647417e-07, + "logits/chosen": -1.040969729423523, + "logits/rejected": -1.025119423866272, + "logps/chosen": -86.0352783203125, + "logps/rejected": -84.01143646240234, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.131622314453125, + "rewards/margins": 11.987198829650879, + "rewards/rejected": -9.855576515197754, + "step": 620 + }, + { + "epoch": 0.29, + "learning_rate": 2.872340425531915e-07, + "logits/chosen": -1.0877559185028076, + "logits/rejected": -1.057796835899353, + "logps/chosen": -88.63595581054688, + "logps/rejected": -93.89146423339844, + "loss": 
0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9702014923095703, + "rewards/margins": 12.47899055480957, + "rewards/rejected": -10.508790016174316, + "step": 630 + }, + { + "epoch": 0.29, + "learning_rate": 2.917933130699088e-07, + "logits/chosen": -1.0618011951446533, + "logits/rejected": -1.0364282131195068, + "logps/chosen": -83.58055114746094, + "logps/rejected": -87.19285583496094, + "loss": 0.0172, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3319017887115479, + "rewards/margins": 12.28033447265625, + "rewards/rejected": -10.948432922363281, + "step": 640 + }, + { + "epoch": 0.3, + "learning_rate": 2.9635258358662614e-07, + "logits/chosen": -1.0487759113311768, + "logits/rejected": -1.0237376689910889, + "logps/chosen": -97.61241149902344, + "logps/rejected": -89.84696197509766, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0295894145965576, + "rewards/margins": 13.6948823928833, + "rewards/rejected": -11.665290832519531, + "step": 650 + }, + { + "epoch": 0.3, + "learning_rate": 2.9989856297548603e-07, + "logits/chosen": -1.0819432735443115, + "logits/rejected": -1.0479309558868408, + "logps/chosen": -88.46907043457031, + "logps/rejected": -87.63093566894531, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6469261646270752, + "rewards/margins": 12.705039978027344, + "rewards/rejected": -11.058113098144531, + "step": 660 + }, + { + "epoch": 0.31, + "learning_rate": 2.9939137785291633e-07, + "logits/chosen": -1.0461461544036865, + "logits/rejected": -1.0262444019317627, + "logps/chosen": -88.99845886230469, + "logps/rejected": -88.98668670654297, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7188533544540405, + "rewards/margins": 12.911355972290039, + "rewards/rejected": -11.192502975463867, + "step": 670 + }, + { + "epoch": 0.31, + "learning_rate": 2.9888419273034654e-07, + "logits/chosen": -1.0983555316925049, + "logits/rejected": -1.058870792388916, + 
"logps/chosen": -92.72132873535156, + "logps/rejected": -91.1702880859375, + "loss": 0.0139, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1177186965942383, + "rewards/margins": 12.790249824523926, + "rewards/rejected": -10.672532081604004, + "step": 680 + }, + { + "epoch": 0.31, + "learning_rate": 2.9837700760777684e-07, + "logits/chosen": -1.0601097345352173, + "logits/rejected": -1.0348825454711914, + "logps/chosen": -86.94996643066406, + "logps/rejected": -90.38238525390625, + "loss": 0.0086, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.6195377111434937, + "rewards/margins": 13.726669311523438, + "rewards/rejected": -12.107131958007812, + "step": 690 + }, + { + "epoch": 0.32, + "learning_rate": 2.978698224852071e-07, + "logits/chosen": -1.0934697389602661, + "logits/rejected": -1.0594290494918823, + "logps/chosen": -89.61146545410156, + "logps/rejected": -92.48619842529297, + "loss": 0.0048, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.0675747394561768, + "rewards/margins": 13.449430465698242, + "rewards/rejected": -11.381856918334961, + "step": 700 + }, + { + "epoch": 0.32, + "eval_logits/chosen": -1.1140062808990479, + "eval_logits/rejected": -1.0765314102172852, + "eval_logps/chosen": -85.63339233398438, + "eval_logps/rejected": -89.27959442138672, + "eval_loss": 0.010984507389366627, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 1.8995180130004883, + "eval_rewards/margins": 13.868452072143555, + "eval_rewards/rejected": -11.968934059143066, + "eval_runtime": 63.0243, + "eval_samples_per_second": 45.411, + "eval_steps_per_second": 2.84, + "step": 700 + }, + { + "epoch": 0.32, + "learning_rate": 2.9736263736263735e-07, + "logits/chosen": -1.1225736141204834, + "logits/rejected": -1.079655408859253, + "logps/chosen": -84.65995788574219, + "logps/rejected": -93.70130920410156, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7059738636016846, + 
"rewards/margins": 13.980459213256836, + "rewards/rejected": -12.27448558807373, + "step": 710 + }, + { + "epoch": 0.33, + "learning_rate": 2.968554522400676e-07, + "logits/chosen": -1.0669975280761719, + "logits/rejected": -1.0285903215408325, + "logps/chosen": -89.87354278564453, + "logps/rejected": -91.52484893798828, + "loss": 0.0063, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.0469908714294434, + "rewards/margins": 14.076245307922363, + "rewards/rejected": -12.029253005981445, + "step": 720 + }, + { + "epoch": 0.33, + "learning_rate": 2.9634826711749786e-07, + "logits/chosen": -1.0944750308990479, + "logits/rejected": -1.0600286722183228, + "logps/chosen": -90.2714614868164, + "logps/rejected": -86.80066680908203, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6248250007629395, + "rewards/margins": 14.128560066223145, + "rewards/rejected": -11.503734588623047, + "step": 730 + }, + { + "epoch": 0.34, + "learning_rate": 2.958410819949281e-07, + "logits/chosen": -1.0793449878692627, + "logits/rejected": -1.0554149150848389, + "logps/chosen": -78.48091888427734, + "logps/rejected": -89.2685317993164, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.959774374961853, + "rewards/margins": 14.275179862976074, + "rewards/rejected": -12.315404891967773, + "step": 740 + }, + { + "epoch": 0.34, + "learning_rate": 2.953338968723584e-07, + "logits/chosen": -1.0978403091430664, + "logits/rejected": -1.0679986476898193, + "logps/chosen": -79.7038345336914, + "logps/rejected": -89.61735534667969, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5487966537475586, + "rewards/margins": 15.569620132446289, + "rewards/rejected": -13.020822525024414, + "step": 750 + }, + { + "epoch": 0.35, + "learning_rate": 2.948267117497887e-07, + "logits/chosen": -1.1022833585739136, + "logits/rejected": -1.0697553157806396, + "logps/chosen": -87.8471450805664, + "logps/rejected": -95.08956146240234, + "loss": 
0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.898350715637207, + "rewards/margins": 14.861310005187988, + "rewards/rejected": -11.962959289550781, + "step": 760 + }, + { + "epoch": 0.35, + "learning_rate": 2.9431952662721893e-07, + "logits/chosen": -1.0957014560699463, + "logits/rejected": -1.0692951679229736, + "logps/chosen": -85.73948669433594, + "logps/rejected": -89.58802795410156, + "loss": 0.0072, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3441336154937744, + "rewards/margins": 13.35987377166748, + "rewards/rejected": -12.015739440917969, + "step": 770 + }, + { + "epoch": 0.36, + "learning_rate": 2.938123415046492e-07, + "logits/chosen": -1.0931169986724854, + "logits/rejected": -1.0750610828399658, + "logps/chosen": -83.94388580322266, + "logps/rejected": -90.04351043701172, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7953174114227295, + "rewards/margins": 14.375581741333008, + "rewards/rejected": -12.5802640914917, + "step": 780 + }, + { + "epoch": 0.36, + "learning_rate": 2.9330515638207944e-07, + "logits/chosen": -1.1000292301177979, + "logits/rejected": -1.0617127418518066, + "logps/chosen": -85.10647583007812, + "logps/rejected": -95.74652099609375, + "loss": 0.0055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9242041110992432, + "rewards/margins": 15.295672416687012, + "rewards/rejected": -13.371467590332031, + "step": 790 + }, + { + "epoch": 0.37, + "learning_rate": 2.927979712595097e-07, + "logits/chosen": -1.074942708015442, + "logits/rejected": -1.0459177494049072, + "logps/chosen": -87.57856750488281, + "logps/rejected": -92.28514862060547, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.659388780593872, + "rewards/margins": 13.915122985839844, + "rewards/rejected": -12.25573444366455, + "step": 800 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -1.1495593786239624, + "eval_logits/rejected": -1.104751467704773, + "eval_logps/chosen": 
-85.99646759033203, + "eval_logps/rejected": -93.16914367675781, + "eval_loss": 0.01038471981883049, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 1.7179815769195557, + "eval_rewards/margins": 15.631699562072754, + "eval_rewards/rejected": -13.913717269897461, + "eval_runtime": 65.9739, + "eval_samples_per_second": 43.381, + "eval_steps_per_second": 2.713, + "step": 800 + }, + { + "epoch": 0.37, + "learning_rate": 2.9229078613694e-07, + "logits/chosen": -1.1248949766159058, + "logits/rejected": -1.086717128753662, + "logps/chosen": -86.31236267089844, + "logps/rejected": -93.25736236572266, + "loss": 0.0126, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.9837150573730469, + "rewards/margins": 15.558454513549805, + "rewards/rejected": -13.574739456176758, + "step": 810 + }, + { + "epoch": 0.37, + "learning_rate": 2.917836010143702e-07, + "logits/chosen": -1.0978708267211914, + "logits/rejected": -1.057366132736206, + "logps/chosen": -81.67508697509766, + "logps/rejected": -91.41726684570312, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.00944185256958, + "rewards/margins": 15.302815437316895, + "rewards/rejected": -13.293373107910156, + "step": 820 + }, + { + "epoch": 0.38, + "learning_rate": 2.912764158918005e-07, + "logits/chosen": -1.1044955253601074, + "logits/rejected": -1.0758087635040283, + "logps/chosen": -82.3414306640625, + "logps/rejected": -97.22112274169922, + "loss": 0.0106, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.6446453332901, + "rewards/margins": 15.301877975463867, + "rewards/rejected": -13.657232284545898, + "step": 830 + }, + { + "epoch": 0.38, + "learning_rate": 2.9076923076923076e-07, + "logits/chosen": -1.124459981918335, + "logits/rejected": -1.0775994062423706, + "logps/chosen": -90.77351379394531, + "logps/rejected": -92.74385833740234, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0721054077148438, + "rewards/margins": 
16.051000595092773, + "rewards/rejected": -13.97889518737793, + "step": 840 + }, + { + "epoch": 0.39, + "learning_rate": 2.90262045646661e-07, + "logits/chosen": -1.1489288806915283, + "logits/rejected": -1.1091543436050415, + "logps/chosen": -83.20817565917969, + "logps/rejected": -99.59390258789062, + "loss": 0.0079, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3499611616134644, + "rewards/margins": 16.251693725585938, + "rewards/rejected": -14.9017333984375, + "step": 850 + }, + { + "epoch": 0.39, + "learning_rate": 2.8975486052409127e-07, + "logits/chosen": -1.1202361583709717, + "logits/rejected": -1.0809228420257568, + "logps/chosen": -93.94721984863281, + "logps/rejected": -97.80915069580078, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6343969106674194, + "rewards/margins": 16.120519638061523, + "rewards/rejected": -14.486124038696289, + "step": 860 + }, + { + "epoch": 0.4, + "learning_rate": 2.892476754015215e-07, + "logits/chosen": -1.1433136463165283, + "logits/rejected": -1.0998663902282715, + "logps/chosen": -88.42134094238281, + "logps/rejected": -97.21299743652344, + "loss": 0.009, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.4050545692443848, + "rewards/margins": 17.111103057861328, + "rewards/rejected": -14.706045150756836, + "step": 870 + }, + { + "epoch": 0.4, + "learning_rate": 2.887404902789518e-07, + "logits/chosen": -1.1467622518539429, + "logits/rejected": -1.1034185886383057, + "logps/chosen": -93.99958801269531, + "logps/rejected": -99.74283599853516, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3127968311309814, + "rewards/margins": 16.9749698638916, + "rewards/rejected": -14.6621732711792, + "step": 880 + }, + { + "epoch": 0.41, + "learning_rate": 2.882333051563821e-07, + "logits/chosen": -1.1312350034713745, + "logits/rejected": -1.0877126455307007, + "logps/chosen": -85.80730438232422, + "logps/rejected": -98.7739028930664, + "loss": 0.0046, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 1.6915788650512695, + "rewards/margins": 17.439577102661133, + "rewards/rejected": -15.747998237609863, + "step": 890 + }, + { + "epoch": 0.41, + "learning_rate": 2.8772612003381234e-07, + "logits/chosen": -1.1451927423477173, + "logits/rejected": -1.1060543060302734, + "logps/chosen": -86.88992309570312, + "logps/rejected": -99.46288299560547, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0115227699279785, + "rewards/margins": 17.152305603027344, + "rewards/rejected": -15.140782356262207, + "step": 900 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -1.1781193017959595, + "eval_logits/rejected": -1.1281659603118896, + "eval_logps/chosen": -86.0967788696289, + "eval_logps/rejected": -96.05574798583984, + "eval_loss": 0.009535559453070164, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.6678271293640137, + "eval_rewards/margins": 17.024843215942383, + "eval_rewards/rejected": -15.357016563415527, + "eval_runtime": 61.9434, + "eval_samples_per_second": 46.203, + "eval_steps_per_second": 2.89, + "step": 900 + }, + { + "epoch": 0.42, + "learning_rate": 2.872189349112426e-07, + "logits/chosen": -1.138933539390564, + "logits/rejected": -1.0921571254730225, + "logps/chosen": -89.17816162109375, + "logps/rejected": -98.90169525146484, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6821825504302979, + "rewards/margins": 17.2696533203125, + "rewards/rejected": -15.587472915649414, + "step": 910 + }, + { + "epoch": 0.42, + "learning_rate": 2.8671174978867285e-07, + "logits/chosen": -1.132676362991333, + "logits/rejected": -1.0800034999847412, + "logps/chosen": -86.4345932006836, + "logps/rejected": -97.37138366699219, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.264104127883911, + "rewards/margins": 17.31032371520996, + "rewards/rejected": -15.046220779418945, + "step": 920 + }, + { + "epoch": 0.42, + "learning_rate": 
2.862045646661031e-07, + "logits/chosen": -1.1458117961883545, + "logits/rejected": -1.0923669338226318, + "logps/chosen": -92.89430236816406, + "logps/rejected": -95.99363708496094, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.16265869140625, + "rewards/margins": 18.03095054626465, + "rewards/rejected": -15.868295669555664, + "step": 930 + }, + { + "epoch": 0.43, + "learning_rate": 2.8569737954353336e-07, + "logits/chosen": -1.1349055767059326, + "logits/rejected": -1.083634614944458, + "logps/chosen": -93.05912780761719, + "logps/rejected": -103.2133560180664, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1463124752044678, + "rewards/margins": 17.069040298461914, + "rewards/rejected": -15.922727584838867, + "step": 940 + }, + { + "epoch": 0.43, + "learning_rate": 2.8519019442096367e-07, + "logits/chosen": -1.14430832862854, + "logits/rejected": -1.103515386581421, + "logps/chosen": -86.96769714355469, + "logps/rejected": -103.5058364868164, + "loss": 0.0103, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6363202333450317, + "rewards/margins": 18.16317367553711, + "rewards/rejected": -17.52685546875, + "step": 950 + }, + { + "epoch": 0.44, + "learning_rate": 2.846830092983939e-07, + "logits/chosen": -1.1029218435287476, + "logits/rejected": -1.0647412538528442, + "logps/chosen": -89.83350372314453, + "logps/rejected": -97.29312133789062, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2907755374908447, + "rewards/margins": 16.094762802124023, + "rewards/rejected": -14.803988456726074, + "step": 960 + }, + { + "epoch": 0.44, + "learning_rate": 2.841758241758242e-07, + "logits/chosen": -1.1452381610870361, + "logits/rejected": -1.1065289974212646, + "logps/chosen": -86.73941802978516, + "logps/rejected": -92.15290069580078, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6819941997528076, + "rewards/margins": 15.786001205444336, + "rewards/rejected": 
-14.10400676727295, + "step": 970 + }, + { + "epoch": 0.45, + "learning_rate": 2.8366863905325443e-07, + "logits/chosen": -1.1149189472198486, + "logits/rejected": -1.0778940916061401, + "logps/chosen": -87.29264068603516, + "logps/rejected": -98.55472564697266, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.247238278388977, + "rewards/margins": 16.33363914489746, + "rewards/rejected": -15.086400032043457, + "step": 980 + }, + { + "epoch": 0.45, + "learning_rate": 2.831614539306847e-07, + "logits/chosen": -1.104259967803955, + "logits/rejected": -1.0659714937210083, + "logps/chosen": -83.01313018798828, + "logps/rejected": -93.28031921386719, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3180642127990723, + "rewards/margins": 17.55698585510254, + "rewards/rejected": -15.238920211791992, + "step": 990 + }, + { + "epoch": 0.46, + "learning_rate": 2.8265426880811494e-07, + "logits/chosen": -1.1296952962875366, + "logits/rejected": -1.080012559890747, + "logps/chosen": -88.91751098632812, + "logps/rejected": -97.22592163085938, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.736224889755249, + "rewards/margins": 17.38802719116211, + "rewards/rejected": -15.651802062988281, + "step": 1000 + }, + { + "epoch": 0.46, + "eval_logits/chosen": -1.1499282121658325, + "eval_logits/rejected": -1.1065919399261475, + "eval_logps/chosen": -85.68370056152344, + "eval_logps/rejected": -95.3034439086914, + "eval_loss": 0.008165295235812664, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 1.8743646144866943, + "eval_rewards/margins": 16.85523223876953, + "eval_rewards/rejected": -14.980865478515625, + "eval_runtime": 64.5918, + "eval_samples_per_second": 44.309, + "eval_steps_per_second": 2.771, + "step": 1000 + }, + { + "epoch": 0.46, + "learning_rate": 2.821470836855452e-07, + "logits/chosen": -1.105207085609436, + "logits/rejected": -1.0650384426116943, + "logps/chosen": -86.5456314086914, + 
"logps/rejected": -99.33902740478516, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.171086072921753, + "rewards/margins": 16.355039596557617, + "rewards/rejected": -15.183954238891602, + "step": 1010 + }, + { + "epoch": 0.47, + "learning_rate": 2.8163989856297545e-07, + "logits/chosen": -1.093670129776001, + "logits/rejected": -1.0636560916900635, + "logps/chosen": -84.88511657714844, + "logps/rejected": -100.3253173828125, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8782209157943726, + "rewards/margins": 17.67445945739746, + "rewards/rejected": -15.796239852905273, + "step": 1020 + }, + { + "epoch": 0.47, + "learning_rate": 2.8113271344040575e-07, + "logits/chosen": -1.1384307146072388, + "logits/rejected": -1.0990992784500122, + "logps/chosen": -82.6616439819336, + "logps/rejected": -98.50576782226562, + "loss": 0.0067, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3514288663864136, + "rewards/margins": 17.497568130493164, + "rewards/rejected": -16.14613914489746, + "step": 1030 + }, + { + "epoch": 0.47, + "learning_rate": 2.80625528317836e-07, + "logits/chosen": -1.099442481994629, + "logits/rejected": -1.072345495223999, + "logps/chosen": -94.7537612915039, + "logps/rejected": -99.29774475097656, + "loss": 0.0079, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.425427198410034, + "rewards/margins": 17.845937728881836, + "rewards/rejected": -15.420511245727539, + "step": 1040 + }, + { + "epoch": 0.48, + "learning_rate": 2.8011834319526626e-07, + "logits/chosen": -1.1023457050323486, + "logits/rejected": -1.0619539022445679, + "logps/chosen": -89.6052474975586, + "logps/rejected": -101.50379943847656, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8860256671905518, + "rewards/margins": 18.8059024810791, + "rewards/rejected": -16.919872283935547, + "step": 1050 + }, + { + "epoch": 0.48, + "learning_rate": 2.796111580726965e-07, + "logits/chosen": 
-1.1182763576507568, + "logits/rejected": -1.083601474761963, + "logps/chosen": -86.69522857666016, + "logps/rejected": -98.14034271240234, + "loss": 0.0078, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7847849130630493, + "rewards/margins": 16.757022857666016, + "rewards/rejected": -15.972234725952148, + "step": 1060 + }, + { + "epoch": 0.49, + "learning_rate": 2.7910397295012677e-07, + "logits/chosen": -1.0921047925949097, + "logits/rejected": -1.058253526687622, + "logps/chosen": -88.3211898803711, + "logps/rejected": -100.35057067871094, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.095982074737549, + "rewards/margins": 18.4968318939209, + "rewards/rejected": -16.40085220336914, + "step": 1070 + }, + { + "epoch": 0.49, + "learning_rate": 2.78596787827557e-07, + "logits/chosen": -1.1086832284927368, + "logits/rejected": -1.074639916419983, + "logps/chosen": -88.3951416015625, + "logps/rejected": -101.92322540283203, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5710289478302, + "rewards/margins": 18.696849822998047, + "rewards/rejected": -16.12582015991211, + "step": 1080 + }, + { + "epoch": 0.5, + "learning_rate": 2.7808960270498733e-07, + "logits/chosen": -1.1255228519439697, + "logits/rejected": -1.0840017795562744, + "logps/chosen": -84.0284194946289, + "logps/rejected": -103.49464416503906, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7648484706878662, + "rewards/margins": 18.157766342163086, + "rewards/rejected": -16.39291763305664, + "step": 1090 + }, + { + "epoch": 0.5, + "learning_rate": 2.775824175824176e-07, + "logits/chosen": -1.1340689659118652, + "logits/rejected": -1.095685362815857, + "logps/chosen": -86.91542053222656, + "logps/rejected": -100.10322570800781, + "loss": 0.0105, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1016123294830322, + "rewards/margins": 17.42715835571289, + "rewards/rejected": -16.325546264648438, + "step": 1100 + 
}, + { + "epoch": 0.5, + "eval_logits/chosen": -1.1586607694625854, + "eval_logits/rejected": -1.1157867908477783, + "eval_logps/chosen": -85.93265533447266, + "eval_logps/rejected": -96.73734283447266, + "eval_loss": 0.007867630571126938, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.7498830556869507, + "eval_rewards/margins": 17.447694778442383, + "eval_rewards/rejected": -15.6978120803833, + "eval_runtime": 64.3858, + "eval_samples_per_second": 44.451, + "eval_steps_per_second": 2.78, + "step": 1100 + }, + { + "epoch": 0.51, + "learning_rate": 2.7707523245984784e-07, + "logits/chosen": -1.1197665929794312, + "logits/rejected": -1.078808307647705, + "logps/chosen": -92.86619567871094, + "logps/rejected": -102.52835845947266, + "loss": 0.0051, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6940488815307617, + "rewards/margins": 17.81356430053711, + "rewards/rejected": -16.11951446533203, + "step": 1110 + }, + { + "epoch": 0.51, + "learning_rate": 2.765680473372781e-07, + "logits/chosen": -1.1259820461273193, + "logits/rejected": -1.0869123935699463, + "logps/chosen": -93.55479431152344, + "logps/rejected": -95.98357391357422, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1045427322387695, + "rewards/margins": 17.43987464904785, + "rewards/rejected": -15.335331916809082, + "step": 1120 + }, + { + "epoch": 0.52, + "learning_rate": 2.7606086221470835e-07, + "logits/chosen": -1.137892246246338, + "logits/rejected": -1.0994970798492432, + "logps/chosen": -90.780517578125, + "logps/rejected": -102.3265380859375, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.290585517883301, + "rewards/margins": 18.35919761657715, + "rewards/rejected": -16.068613052368164, + "step": 1130 + }, + { + "epoch": 0.52, + "learning_rate": 2.755536770921386e-07, + "logits/chosen": -1.1444514989852905, + "logits/rejected": -1.1056411266326904, + "logps/chosen": -86.2841796875, + 
"logps/rejected": -100.90594482421875, + "loss": 0.0068, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.109790325164795, + "rewards/margins": 17.33005142211914, + "rewards/rejected": -16.220258712768555, + "step": 1140 + }, + { + "epoch": 0.52, + "learning_rate": 2.750464919695689e-07, + "logits/chosen": -1.1471726894378662, + "logits/rejected": -1.1120365858078003, + "logps/chosen": -83.0753402709961, + "logps/rejected": -98.07260131835938, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3137190341949463, + "rewards/margins": 17.202226638793945, + "rewards/rejected": -15.888509750366211, + "step": 1150 + }, + { + "epoch": 0.53, + "learning_rate": 2.745393068469991e-07, + "logits/chosen": -1.131012201309204, + "logits/rejected": -1.0842430591583252, + "logps/chosen": -97.24982452392578, + "logps/rejected": -103.58902740478516, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.803784728050232, + "rewards/margins": 19.04859161376953, + "rewards/rejected": -17.244808197021484, + "step": 1160 + }, + { + "epoch": 0.53, + "learning_rate": 2.740321217244294e-07, + "logits/chosen": -1.1448577642440796, + "logits/rejected": -1.0983836650848389, + "logps/chosen": -88.2474136352539, + "logps/rejected": -101.29617309570312, + "loss": 0.0095, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.321990489959717, + "rewards/margins": 19.179903030395508, + "rewards/rejected": -16.857913970947266, + "step": 1170 + }, + { + "epoch": 0.54, + "learning_rate": 2.7352493660185967e-07, + "logits/chosen": -1.15381920337677, + "logits/rejected": -1.1216586828231812, + "logps/chosen": -79.86784362792969, + "logps/rejected": -101.373046875, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6416583061218262, + "rewards/margins": 18.733495712280273, + "rewards/rejected": -18.091835021972656, + "step": 1180 + }, + { + "epoch": 0.54, + "learning_rate": 2.7301775147928993e-07, + "logits/chosen": 
-1.1581026315689087, + "logits/rejected": -1.1127700805664062, + "logps/chosen": -95.01976013183594, + "logps/rejected": -102.31490325927734, + "loss": 0.0073, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2217804193496704, + "rewards/margins": 18.29319953918457, + "rewards/rejected": -17.07141876220703, + "step": 1190 + }, + { + "epoch": 0.55, + "learning_rate": 2.725105663567202e-07, + "logits/chosen": -1.1556333303451538, + "logits/rejected": -1.1143467426300049, + "logps/chosen": -85.31793975830078, + "logps/rejected": -104.33609771728516, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.109412670135498, + "rewards/margins": 19.068283081054688, + "rewards/rejected": -16.958871841430664, + "step": 1200 + }, + { + "epoch": 0.55, + "eval_logits/chosen": -1.2309268712997437, + "eval_logits/rejected": -1.177735447883606, + "eval_logps/chosen": -86.51016998291016, + "eval_logps/rejected": -97.42555236816406, + "eval_loss": 0.008077413775026798, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.4611291885375977, + "eval_rewards/margins": 17.5030517578125, + "eval_rewards/rejected": -16.041919708251953, + "eval_runtime": 61.5469, + "eval_samples_per_second": 46.501, + "eval_steps_per_second": 2.908, + "step": 1200 + }, + { + "epoch": 0.55, + "learning_rate": 2.7200338123415044e-07, + "logits/chosen": -1.1782100200653076, + "logits/rejected": -1.1402640342712402, + "logps/chosen": -92.4530029296875, + "logps/rejected": -104.04048919677734, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6389929056167603, + "rewards/margins": 18.719791412353516, + "rewards/rejected": -17.080801010131836, + "step": 1210 + }, + { + "epoch": 0.56, + "learning_rate": 2.714961961115807e-07, + "logits/chosen": -1.1656419038772583, + "logits/rejected": -1.1300376653671265, + "logps/chosen": -83.96723175048828, + "logps/rejected": -97.54676055908203, + "loss": 0.009, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 1.120603084564209, + "rewards/margins": 17.0549259185791, + "rewards/rejected": -15.934321403503418, + "step": 1220 + }, + { + "epoch": 0.56, + "learning_rate": 2.70989010989011e-07, + "logits/chosen": -1.181616187095642, + "logits/rejected": -1.140226125717163, + "logps/chosen": -86.73959350585938, + "logps/rejected": -100.60263061523438, + "loss": 0.0033, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.8291637897491455, + "rewards/margins": 17.071361541748047, + "rewards/rejected": -15.242197036743164, + "step": 1230 + }, + { + "epoch": 0.57, + "learning_rate": 2.7048182586644125e-07, + "logits/chosen": -1.1932631731033325, + "logits/rejected": -1.1404846906661987, + "logps/chosen": -89.65509796142578, + "logps/rejected": -100.72111511230469, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.12876558303833, + "rewards/margins": 18.558353424072266, + "rewards/rejected": -16.429588317871094, + "step": 1240 + }, + { + "epoch": 0.57, + "learning_rate": 2.699746407438715e-07, + "logits/chosen": -1.1967064142227173, + "logits/rejected": -1.1478387117385864, + "logps/chosen": -86.44863891601562, + "logps/rejected": -100.36338806152344, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.105329990386963, + "rewards/margins": 18.038101196289062, + "rewards/rejected": -16.932769775390625, + "step": 1250 + }, + { + "epoch": 0.58, + "learning_rate": 2.6946745562130176e-07, + "logits/chosen": -1.1940025091171265, + "logits/rejected": -1.1532320976257324, + "logps/chosen": -89.0478744506836, + "logps/rejected": -102.28709411621094, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9386533498764038, + "rewards/margins": 18.23946762084961, + "rewards/rejected": -16.30081558227539, + "step": 1260 + }, + { + "epoch": 0.58, + "learning_rate": 2.68960270498732e-07, + "logits/chosen": -1.1994531154632568, + "logits/rejected": -1.1430509090423584, + "logps/chosen": -89.60600280761719, + 
"logps/rejected": -107.12632751464844, + "loss": 0.0077, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8430372476577759, + "rewards/margins": 19.31827163696289, + "rewards/rejected": -18.475234985351562, + "step": 1270 + }, + { + "epoch": 0.58, + "learning_rate": 2.6845308537616227e-07, + "logits/chosen": -1.1602545976638794, + "logits/rejected": -1.1090123653411865, + "logps/chosen": -87.08576202392578, + "logps/rejected": -95.82630920410156, + "loss": 0.011, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.725023627281189, + "rewards/margins": 18.602710723876953, + "rewards/rejected": -16.877687454223633, + "step": 1280 + }, + { + "epoch": 0.59, + "learning_rate": 2.679459002535926e-07, + "logits/chosen": -1.2227842807769775, + "logits/rejected": -1.1634962558746338, + "logps/chosen": -88.16740417480469, + "logps/rejected": -97.47923278808594, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8518360257148743, + "rewards/margins": 17.11991310119629, + "rewards/rejected": -16.268077850341797, + "step": 1290 + }, + { + "epoch": 0.59, + "learning_rate": 2.674387151310228e-07, + "logits/chosen": -1.1858384609222412, + "logits/rejected": -1.1380422115325928, + "logps/chosen": -91.48844909667969, + "logps/rejected": -104.26606750488281, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4121227264404297, + "rewards/margins": 19.12758445739746, + "rewards/rejected": -16.71546173095703, + "step": 1300 + }, + { + "epoch": 0.59, + "eval_logits/chosen": -1.2335172891616821, + "eval_logits/rejected": -1.1754339933395386, + "eval_logps/chosen": -86.43448638916016, + "eval_logps/rejected": -99.26506042480469, + "eval_loss": 0.00815549585968256, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.4989758729934692, + "eval_rewards/margins": 18.460649490356445, + "eval_rewards/rejected": -16.961671829223633, + "eval_runtime": 59.4219, + "eval_samples_per_second": 48.164, + 
"eval_steps_per_second": 3.012, + "step": 1300 + }, + { + "epoch": 0.6, + "learning_rate": 2.669315300084531e-07, + "logits/chosen": -1.2054587602615356, + "logits/rejected": -1.1616264581680298, + "logps/chosen": -86.14714050292969, + "logps/rejected": -103.92906188964844, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1422611474990845, + "rewards/margins": 18.06179428100586, + "rewards/rejected": -16.919536590576172, + "step": 1310 + }, + { + "epoch": 0.6, + "learning_rate": 2.6642434488588334e-07, + "logits/chosen": -1.164485216140747, + "logits/rejected": -1.1202260255813599, + "logps/chosen": -83.26927947998047, + "logps/rejected": -103.29952239990234, + "loss": 0.0101, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3391577005386353, + "rewards/margins": 19.02712631225586, + "rewards/rejected": -17.68796730041504, + "step": 1320 + }, + { + "epoch": 0.61, + "learning_rate": 2.659171597633136e-07, + "logits/chosen": -1.1936196088790894, + "logits/rejected": -1.1440293788909912, + "logps/chosen": -84.86544036865234, + "logps/rejected": -104.18849182128906, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3678613901138306, + "rewards/margins": 19.402454376220703, + "rewards/rejected": -18.03459358215332, + "step": 1330 + }, + { + "epoch": 0.61, + "learning_rate": 2.6540997464074385e-07, + "logits/chosen": -1.1995398998260498, + "logits/rejected": -1.1528923511505127, + "logps/chosen": -83.30887603759766, + "logps/rejected": -100.70631408691406, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5668017864227295, + "rewards/margins": 20.389301300048828, + "rewards/rejected": -17.822498321533203, + "step": 1340 + }, + { + "epoch": 0.62, + "learning_rate": 2.649027895181741e-07, + "logits/chosen": -1.2100002765655518, + "logits/rejected": -1.1548488140106201, + "logps/chosen": -82.71726989746094, + "logps/rejected": -100.33686828613281, + "loss": 0.004, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 1.1490261554718018, + "rewards/margins": 18.165735244750977, + "rewards/rejected": -17.016712188720703, + "step": 1350 + }, + { + "epoch": 0.62, + "learning_rate": 2.6439560439560436e-07, + "logits/chosen": -1.2042474746704102, + "logits/rejected": -1.1507916450500488, + "logps/chosen": -89.17523956298828, + "logps/rejected": -103.0042724609375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0882587432861328, + "rewards/margins": 19.461795806884766, + "rewards/rejected": -18.37353515625, + "step": 1360 + }, + { + "epoch": 0.63, + "learning_rate": 2.6388841927303466e-07, + "logits/chosen": -1.19635808467865, + "logits/rejected": -1.1526683568954468, + "logps/chosen": -84.29436492919922, + "logps/rejected": -103.22540283203125, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6204719543457031, + "rewards/margins": 19.50061798095703, + "rewards/rejected": -17.880146026611328, + "step": 1370 + }, + { + "epoch": 0.63, + "learning_rate": 2.633812341504649e-07, + "logits/chosen": -1.1552878618240356, + "logits/rejected": -1.1150346994400024, + "logps/chosen": -84.24638366699219, + "logps/rejected": -101.41566467285156, + "loss": 0.0058, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.426559329032898, + "rewards/margins": 18.867931365966797, + "rewards/rejected": -17.44137191772461, + "step": 1380 + }, + { + "epoch": 0.63, + "learning_rate": 2.6287404902789517e-07, + "logits/chosen": -1.1951394081115723, + "logits/rejected": -1.1533236503601074, + "logps/chosen": -89.08964538574219, + "logps/rejected": -107.95294189453125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.272940754890442, + "rewards/margins": 19.47437858581543, + "rewards/rejected": -18.201440811157227, + "step": 1390 + }, + { + "epoch": 0.64, + "learning_rate": 2.623668639053254e-07, + "logits/chosen": -1.1739484071731567, + "logits/rejected": -1.1234138011932373, + "logps/chosen": -93.20767211914062, + 
"logps/rejected": -102.9843521118164, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4396759271621704, + "rewards/margins": 19.409513473510742, + "rewards/rejected": -17.969837188720703, + "step": 1400 + }, + { + "epoch": 0.64, + "eval_logits/chosen": -1.2195051908493042, + "eval_logits/rejected": -1.1641638278961182, + "eval_logps/chosen": -86.76068115234375, + "eval_logps/rejected": -102.28946685791016, + "eval_loss": 0.008062196895480156, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 1.3358712196350098, + "eval_rewards/margins": 19.80974578857422, + "eval_rewards/rejected": -18.473875045776367, + "eval_runtime": 69.0484, + "eval_samples_per_second": 41.449, + "eval_steps_per_second": 2.592, + "step": 1400 + }, + { + "epoch": 0.64, + "learning_rate": 2.618596787827557e-07, + "logits/chosen": -1.157814621925354, + "logits/rejected": -1.119381308555603, + "logps/chosen": -92.300048828125, + "logps/rejected": -103.40312194824219, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9399839639663696, + "rewards/margins": 19.74970054626465, + "rewards/rejected": -17.80971908569336, + "step": 1410 + }, + { + "epoch": 0.65, + "learning_rate": 2.6135249366018593e-07, + "logits/chosen": -1.1667792797088623, + "logits/rejected": -1.119706392288208, + "logps/chosen": -85.43004608154297, + "logps/rejected": -100.96659851074219, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.285947561264038, + "rewards/margins": 19.657182693481445, + "rewards/rejected": -17.371234893798828, + "step": 1420 + }, + { + "epoch": 0.65, + "learning_rate": 2.6084530853761624e-07, + "logits/chosen": -1.1735081672668457, + "logits/rejected": -1.1159073114395142, + "logps/chosen": -89.15231323242188, + "logps/rejected": -104.07325744628906, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.379710853099823, + "rewards/margins": 18.454669952392578, + "rewards/rejected": -18.07495880126953, + "step": 
1430 + }, + { + "epoch": 0.66, + "learning_rate": 2.603381234150465e-07, + "logits/chosen": -1.177208423614502, + "logits/rejected": -1.1227295398712158, + "logps/chosen": -87.95960235595703, + "logps/rejected": -100.62553405761719, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4888887405395508, + "rewards/margins": 18.826932907104492, + "rewards/rejected": -17.338045120239258, + "step": 1440 + }, + { + "epoch": 0.66, + "learning_rate": 2.5983093829247675e-07, + "logits/chosen": -1.135980248451233, + "logits/rejected": -1.097318410873413, + "logps/chosen": -90.05815887451172, + "logps/rejected": -97.01565551757812, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6748930215835571, + "rewards/margins": 16.284954071044922, + "rewards/rejected": -14.61005973815918, + "step": 1450 + }, + { + "epoch": 0.67, + "learning_rate": 2.59323753169907e-07, + "logits/chosen": -1.1009352207183838, + "logits/rejected": -1.0712867975234985, + "logps/chosen": -90.10000610351562, + "logps/rejected": -104.41805267333984, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7969961762428284, + "rewards/margins": 18.065677642822266, + "rewards/rejected": -17.268680572509766, + "step": 1460 + }, + { + "epoch": 0.67, + "learning_rate": 2.5881656804733726e-07, + "logits/chosen": -1.1424516439437866, + "logits/rejected": -1.104478120803833, + "logps/chosen": -85.87166595458984, + "logps/rejected": -96.37669372558594, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2775685787200928, + "rewards/margins": 17.6835880279541, + "rewards/rejected": -16.40601921081543, + "step": 1470 + }, + { + "epoch": 0.68, + "learning_rate": 2.583093829247675e-07, + "logits/chosen": -1.148421049118042, + "logits/rejected": -1.1051350831985474, + "logps/chosen": -84.38792419433594, + "logps/rejected": -95.69989776611328, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0020928382873535, + "rewards/margins": 
18.655029296875, + "rewards/rejected": -16.652935028076172, + "step": 1480 + }, + { + "epoch": 0.68, + "learning_rate": 2.5780219780219777e-07, + "logits/chosen": -1.137450933456421, + "logits/rejected": -1.1082595586776733, + "logps/chosen": -88.18030548095703, + "logps/rejected": -98.36376190185547, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1630921363830566, + "rewards/margins": 18.910762786865234, + "rewards/rejected": -16.747669219970703, + "step": 1490 + }, + { + "epoch": 0.68, + "learning_rate": 2.57295012679628e-07, + "logits/chosen": -1.144364595413208, + "logits/rejected": -1.1114692687988281, + "logps/chosen": -87.55670928955078, + "logps/rejected": -104.33062744140625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.494736909866333, + "rewards/margins": 19.5846004486084, + "rewards/rejected": -18.08986473083496, + "step": 1500 + }, + { + "epoch": 0.68, + "eval_logits/chosen": -1.2191152572631836, + "eval_logits/rejected": -1.1659319400787354, + "eval_logps/chosen": -87.01031494140625, + "eval_logps/rejected": -101.05850982666016, + "eval_loss": 0.00720271747559309, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.211055040359497, + "eval_rewards/margins": 19.069448471069336, + "eval_rewards/rejected": -17.85839080810547, + "eval_runtime": 77.7452, + "eval_samples_per_second": 36.813, + "eval_steps_per_second": 2.302, + "step": 1500 + }, + { + "epoch": 0.69, + "learning_rate": 2.5678782755705833e-07, + "logits/chosen": -1.1631323099136353, + "logits/rejected": -1.1225086450576782, + "logps/chosen": -82.05781555175781, + "logps/rejected": -102.92134094238281, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1484121084213257, + "rewards/margins": 19.43674087524414, + "rewards/rejected": -18.288328170776367, + "step": 1510 + }, + { + "epoch": 0.69, + "learning_rate": 2.562806424344886e-07, + "logits/chosen": -1.1722062826156616, + "logits/rejected": 
-1.1262253522872925, + "logps/chosen": -92.10721588134766, + "logps/rejected": -107.66226959228516, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5253825187683105, + "rewards/margins": 21.867202758789062, + "rewards/rejected": -19.341819763183594, + "step": 1520 + }, + { + "epoch": 0.7, + "learning_rate": 2.5577345731191884e-07, + "logits/chosen": -1.1755383014678955, + "logits/rejected": -1.1288068294525146, + "logps/chosen": -86.13807678222656, + "logps/rejected": -107.95841217041016, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1721524000167847, + "rewards/margins": 20.330608367919922, + "rewards/rejected": -19.158458709716797, + "step": 1530 + }, + { + "epoch": 0.7, + "learning_rate": 2.552662721893491e-07, + "logits/chosen": -1.1406983137130737, + "logits/rejected": -1.1065037250518799, + "logps/chosen": -88.02510070800781, + "logps/rejected": -102.81829833984375, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6564058661460876, + "rewards/margins": 18.354061126708984, + "rewards/rejected": -17.697656631469727, + "step": 1540 + }, + { + "epoch": 0.71, + "learning_rate": 2.5475908706677935e-07, + "logits/chosen": -1.1616684198379517, + "logits/rejected": -1.1152303218841553, + "logps/chosen": -85.58765411376953, + "logps/rejected": -105.52689361572266, + "loss": 0.0038, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9486013650894165, + "rewards/margins": 20.196699142456055, + "rewards/rejected": -19.24810028076172, + "step": 1550 + }, + { + "epoch": 0.71, + "learning_rate": 2.542519019442096e-07, + "logits/chosen": -1.2028621435165405, + "logits/rejected": -1.1485238075256348, + "logps/chosen": -78.52555847167969, + "logps/rejected": -99.6640853881836, + "loss": 0.0063, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5557008981704712, + "rewards/margins": 19.518081665039062, + "rewards/rejected": -17.962379455566406, + "step": 1560 + }, + { + "epoch": 0.72, + 
"learning_rate": 2.537447168216399e-07, + "logits/chosen": -1.176762342453003, + "logits/rejected": -1.1274961233139038, + "logps/chosen": -93.12080383300781, + "logps/rejected": -105.16368103027344, + "loss": 0.0031, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.659658432006836, + "rewards/margins": 19.342235565185547, + "rewards/rejected": -17.682575225830078, + "step": 1570 + }, + { + "epoch": 0.72, + "learning_rate": 2.5323753169907016e-07, + "logits/chosen": -1.2171311378479004, + "logits/rejected": -1.172149419784546, + "logps/chosen": -86.0450210571289, + "logps/rejected": -107.85368347167969, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6793642044067383, + "rewards/margins": 21.292896270751953, + "rewards/rejected": -19.613529205322266, + "step": 1580 + }, + { + "epoch": 0.73, + "learning_rate": 2.527303465765004e-07, + "logits/chosen": -1.2056636810302734, + "logits/rejected": -1.1607674360275269, + "logps/chosen": -93.74681854248047, + "logps/rejected": -113.48725891113281, + "loss": 0.0047, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6828597187995911, + "rewards/margins": 20.181041717529297, + "rewards/rejected": -19.49818229675293, + "step": 1590 + }, + { + "epoch": 0.73, + "learning_rate": 2.5222316145393067e-07, + "logits/chosen": -1.1712194681167603, + "logits/rejected": -1.124768853187561, + "logps/chosen": -96.73299407958984, + "logps/rejected": -108.51261901855469, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9574599266052246, + "rewards/margins": 20.438793182373047, + "rewards/rejected": -19.481334686279297, + "step": 1600 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -1.2923858165740967, + "eval_logits/rejected": -1.227616786956787, + "eval_logps/chosen": -87.92854309082031, + "eval_logps/rejected": -104.88460540771484, + "eval_loss": 0.007888087071478367, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.7519445419311523, + 
"eval_rewards/margins": 20.523387908935547, + "eval_rewards/rejected": -19.77144432067871, + "eval_runtime": 58.4414, + "eval_samples_per_second": 48.972, + "eval_steps_per_second": 3.063, + "step": 1600 + }, + { + "epoch": 0.73, + "learning_rate": 2.517159763313609e-07, + "logits/chosen": -1.2481939792633057, + "logits/rejected": -1.1921546459197998, + "logps/chosen": -90.30425262451172, + "logps/rejected": -106.55949401855469, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6353106498718262, + "rewards/margins": 21.251611709594727, + "rewards/rejected": -20.616300582885742, + "step": 1610 + }, + { + "epoch": 0.74, + "learning_rate": 2.512087912087912e-07, + "logits/chosen": -1.2304586172103882, + "logits/rejected": -1.173517107963562, + "logps/chosen": -92.87260437011719, + "logps/rejected": -110.1197280883789, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5774765610694885, + "rewards/margins": 20.693300247192383, + "rewards/rejected": -20.115821838378906, + "step": 1620 + }, + { + "epoch": 0.74, + "learning_rate": 2.507016060862215e-07, + "logits/chosen": -1.2224457263946533, + "logits/rejected": -1.1627413034439087, + "logps/chosen": -89.59088134765625, + "logps/rejected": -107.96586608886719, + "loss": 0.0049, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6187242865562439, + "rewards/margins": 21.216934204101562, + "rewards/rejected": -20.598209381103516, + "step": 1630 + }, + { + "epoch": 0.75, + "learning_rate": 2.501944209636517e-07, + "logits/chosen": -1.2341878414154053, + "logits/rejected": -1.1646558046340942, + "logps/chosen": -99.49878692626953, + "logps/rejected": -107.7359390258789, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47276169061660767, + "rewards/margins": 21.252347946166992, + "rewards/rejected": -20.779584884643555, + "step": 1640 + }, + { + "epoch": 0.75, + "learning_rate": 2.49687235841082e-07, + "logits/chosen": -1.2416927814483643, + 
"logits/rejected": -1.1858526468276978, + "logps/chosen": -92.1099624633789, + "logps/rejected": -109.5634765625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.229796051979065, + "rewards/margins": 20.57327651977539, + "rewards/rejected": -19.34347915649414, + "step": 1650 + }, + { + "epoch": 0.76, + "learning_rate": 2.4918005071851225e-07, + "logits/chosen": -1.2330695390701294, + "logits/rejected": -1.1908389329910278, + "logps/chosen": -87.75404357910156, + "logps/rejected": -106.69844055175781, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9370654821395874, + "rewards/margins": 20.381633758544922, + "rewards/rejected": -19.444568634033203, + "step": 1660 + }, + { + "epoch": 0.76, + "learning_rate": 2.486728655959425e-07, + "logits/chosen": -1.2374944686889648, + "logits/rejected": -1.178486943244934, + "logps/chosen": -88.52249145507812, + "logps/rejected": -102.99385070800781, + "loss": 0.0105, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8609209060668945, + "rewards/margins": 20.464916229248047, + "rewards/rejected": -19.6039981842041, + "step": 1670 + }, + { + "epoch": 0.77, + "learning_rate": 2.481656804733728e-07, + "logits/chosen": -1.228576421737671, + "logits/rejected": -1.1720635890960693, + "logps/chosen": -89.48531341552734, + "logps/rejected": -108.29301452636719, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3690917491912842, + "rewards/margins": 20.013029098510742, + "rewards/rejected": -18.643938064575195, + "step": 1680 + }, + { + "epoch": 0.77, + "learning_rate": 2.47658495350803e-07, + "logits/chosen": -1.2124004364013672, + "logits/rejected": -1.1609828472137451, + "logps/chosen": -84.43907928466797, + "logps/rejected": -104.23795318603516, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7291128635406494, + "rewards/margins": 20.71861457824707, + "rewards/rejected": -18.989501953125, + "step": 1690 + }, + { + "epoch": 0.78, + 
"learning_rate": 2.4715131022823327e-07, + "logits/chosen": -1.2294721603393555, + "logits/rejected": -1.1744955778121948, + "logps/chosen": -92.34876251220703, + "logps/rejected": -111.11873626708984, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.766455888748169, + "rewards/margins": 21.175350189208984, + "rewards/rejected": -20.408893585205078, + "step": 1700 + }, + { + "epoch": 0.78, + "eval_logits/chosen": -1.2879317998886108, + "eval_logits/rejected": -1.226698875427246, + "eval_logps/chosen": -87.46847534179688, + "eval_logps/rejected": -104.55460357666016, + "eval_loss": 0.007508194539695978, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.9819736480712891, + "eval_rewards/margins": 20.588415145874023, + "eval_rewards/rejected": -19.606443405151367, + "eval_runtime": 66.3677, + "eval_samples_per_second": 43.123, + "eval_steps_per_second": 2.697, + "step": 1700 + }, + { + "epoch": 0.78, + "learning_rate": 2.4664412510566357e-07, + "logits/chosen": -1.2202476263046265, + "logits/rejected": -1.158785104751587, + "logps/chosen": -89.9117202758789, + "logps/rejected": -105.97224426269531, + "loss": 0.0013, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9460952877998352, + "rewards/margins": 20.959388732910156, + "rewards/rejected": -20.013294219970703, + "step": 1710 + }, + { + "epoch": 0.79, + "learning_rate": 2.4613693998309383e-07, + "logits/chosen": -1.2056442499160767, + "logits/rejected": -1.1586743593215942, + "logps/chosen": -84.9280776977539, + "logps/rejected": -102.05850982666016, + "loss": 0.0051, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1483945846557617, + "rewards/margins": 20.313642501831055, + "rewards/rejected": -18.16524887084961, + "step": 1720 + }, + { + "epoch": 0.79, + "learning_rate": 2.456297548605241e-07, + "logits/chosen": -1.1853172779083252, + "logits/rejected": -1.146316409111023, + "logps/chosen": -85.19999694824219, + "logps/rejected": 
-104.33415222167969, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6200752258300781, + "rewards/margins": 19.86764907836914, + "rewards/rejected": -18.247573852539062, + "step": 1730 + }, + { + "epoch": 0.79, + "learning_rate": 2.4512256973795434e-07, + "logits/chosen": -1.1810106039047241, + "logits/rejected": -1.1348565816879272, + "logps/chosen": -88.91767120361328, + "logps/rejected": -105.30973815917969, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.908353567123413, + "rewards/margins": 20.9567928314209, + "rewards/rejected": -19.04844093322754, + "step": 1740 + }, + { + "epoch": 0.8, + "learning_rate": 2.446153846153846e-07, + "logits/chosen": -1.1730239391326904, + "logits/rejected": -1.1376618146896362, + "logps/chosen": -87.39840698242188, + "logps/rejected": -104.39476013183594, + "loss": 0.005, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5575692653656006, + "rewards/margins": 19.33660125732422, + "rewards/rejected": -17.779033660888672, + "step": 1750 + }, + { + "epoch": 0.8, + "learning_rate": 2.4410819949281484e-07, + "logits/chosen": -1.1946265697479248, + "logits/rejected": -1.1434904336929321, + "logps/chosen": -89.12718963623047, + "logps/rejected": -110.394287109375, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4408177435398102, + "rewards/margins": 19.224130630493164, + "rewards/rejected": -18.78331184387207, + "step": 1760 + }, + { + "epoch": 0.81, + "learning_rate": 2.4360101437024515e-07, + "logits/chosen": -1.1789695024490356, + "logits/rejected": -1.136823296546936, + "logps/chosen": -83.54109954833984, + "logps/rejected": -104.07505798339844, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6552903652191162, + "rewards/margins": 20.878345489501953, + "rewards/rejected": -19.223054885864258, + "step": 1770 + }, + { + "epoch": 0.81, + "learning_rate": 2.4309382924767535e-07, + "logits/chosen": -1.1888688802719116, + 
"logits/rejected": -1.1330959796905518, + "logps/chosen": -91.33379364013672, + "logps/rejected": -110.06937408447266, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4772006869316101, + "rewards/margins": 20.467754364013672, + "rewards/rejected": -19.990554809570312, + "step": 1780 + }, + { + "epoch": 0.82, + "learning_rate": 2.4258664412510566e-07, + "logits/chosen": -1.1968967914581299, + "logits/rejected": -1.1466515064239502, + "logps/chosen": -90.5532455444336, + "logps/rejected": -102.20117950439453, + "loss": 0.0105, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.0483832359313965, + "rewards/margins": 20.55087661743164, + "rewards/rejected": -18.502490997314453, + "step": 1790 + }, + { + "epoch": 0.82, + "learning_rate": 2.420794590025359e-07, + "logits/chosen": -1.1845229864120483, + "logits/rejected": -1.137880802154541, + "logps/chosen": -90.67330932617188, + "logps/rejected": -107.00489807128906, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5007214546203613, + "rewards/margins": 20.610965728759766, + "rewards/rejected": -19.110244750976562, + "step": 1800 + }, + { + "epoch": 0.82, + "eval_logits/chosen": -1.2606803178787231, + "eval_logits/rejected": -1.1984455585479736, + "eval_logps/chosen": -86.78810119628906, + "eval_logps/rejected": -103.85710906982422, + "eval_loss": 0.008221164345741272, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.322161316871643, + "eval_rewards/margins": 20.57985496520996, + "eval_rewards/rejected": -19.257694244384766, + "eval_runtime": 70.2981, + "eval_samples_per_second": 40.712, + "eval_steps_per_second": 2.546, + "step": 1800 + }, + { + "epoch": 0.83, + "learning_rate": 2.4157227387996617e-07, + "logits/chosen": -1.2245190143585205, + "logits/rejected": -1.1681041717529297, + "logps/chosen": -90.532470703125, + "logps/rejected": -104.7442398071289, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 
0.7430979609489441, + "rewards/margins": 20.217361450195312, + "rewards/rejected": -19.47426414489746, + "step": 1810 + }, + { + "epoch": 0.83, + "learning_rate": 2.410650887573965e-07, + "logits/chosen": -1.2046802043914795, + "logits/rejected": -1.153531789779663, + "logps/chosen": -89.37370300292969, + "logps/rejected": -104.24407958984375, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3622597455978394, + "rewards/margins": 19.505048751831055, + "rewards/rejected": -18.14278793334961, + "step": 1820 + }, + { + "epoch": 0.84, + "learning_rate": 2.405579036348267e-07, + "logits/chosen": -1.1867458820343018, + "logits/rejected": -1.1448475122451782, + "logps/chosen": -80.08723449707031, + "logps/rejected": -102.7812728881836, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2673819065093994, + "rewards/margins": 19.793813705444336, + "rewards/rejected": -18.526432037353516, + "step": 1830 + }, + { + "epoch": 0.84, + "learning_rate": 2.4005071851225693e-07, + "logits/chosen": -1.1724661588668823, + "logits/rejected": -1.1277903318405151, + "logps/chosen": -90.30950927734375, + "logps/rejected": -113.6275405883789, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5377770662307739, + "rewards/margins": 20.727577209472656, + "rewards/rejected": -20.189800262451172, + "step": 1840 + }, + { + "epoch": 0.84, + "learning_rate": 2.3954353338968724e-07, + "logits/chosen": -1.1979376077651978, + "logits/rejected": -1.155174970626831, + "logps/chosen": -83.19313049316406, + "logps/rejected": -104.564453125, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9587980508804321, + "rewards/margins": 20.022220611572266, + "rewards/rejected": -19.063425064086914, + "step": 1850 + }, + { + "epoch": 0.85, + "learning_rate": 2.390363482671175e-07, + "logits/chosen": -1.170663595199585, + "logits/rejected": -1.11752450466156, + "logps/chosen": -90.89318084716797, + "logps/rejected": -109.8968734741211, + 
"loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.412500262260437, + "rewards/margins": 21.69009017944336, + "rewards/rejected": -20.277587890625, + "step": 1860 + }, + { + "epoch": 0.85, + "learning_rate": 2.3852916314454775e-07, + "logits/chosen": -1.1926937103271484, + "logits/rejected": -1.1473562717437744, + "logps/chosen": -93.6591796875, + "logps/rejected": -106.84416198730469, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8632802963256836, + "rewards/margins": 19.899490356445312, + "rewards/rejected": -18.036211013793945, + "step": 1870 + }, + { + "epoch": 0.86, + "learning_rate": 2.38021978021978e-07, + "logits/chosen": -1.1921271085739136, + "logits/rejected": -1.1413754224777222, + "logps/chosen": -85.18832397460938, + "logps/rejected": -105.3952407836914, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8996086120605469, + "rewards/margins": 20.485759735107422, + "rewards/rejected": -19.586151123046875, + "step": 1880 + }, + { + "epoch": 0.86, + "learning_rate": 2.3751479289940826e-07, + "logits/chosen": -1.2049638032913208, + "logits/rejected": -1.1474329233169556, + "logps/chosen": -92.87411499023438, + "logps/rejected": -106.48674011230469, + "loss": 0.0039, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1706862449645996, + "rewards/margins": 22.162261962890625, + "rewards/rejected": -19.991575241088867, + "step": 1890 + }, + { + "epoch": 0.87, + "learning_rate": 2.3700760777683854e-07, + "logits/chosen": -1.1745494604110718, + "logits/rejected": -1.1166932582855225, + "logps/chosen": -90.28245544433594, + "logps/rejected": -111.68940734863281, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.081857204437256, + "rewards/margins": 21.467239379882812, + "rewards/rejected": -19.385379791259766, + "step": 1900 + }, + { + "epoch": 0.87, + "eval_logits/chosen": -1.2352814674377441, + "eval_logits/rejected": -1.1811531782150269, + "eval_logps/chosen": 
-86.63172149658203, + "eval_logps/rejected": -104.2937240600586, + "eval_loss": 0.007481275591999292, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": 1.4003478288650513, + "eval_rewards/margins": 20.87635612487793, + "eval_rewards/rejected": -19.47600746154785, + "eval_runtime": 64.2566, + "eval_samples_per_second": 44.54, + "eval_steps_per_second": 2.786, + "step": 1900 + }, + { + "epoch": 0.87, + "learning_rate": 2.3650042265426882e-07, + "logits/chosen": -1.2035939693450928, + "logits/rejected": -1.1619645357131958, + "logps/chosen": -87.07538604736328, + "logps/rejected": -105.97283935546875, + "loss": 0.0035, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9201610088348389, + "rewards/margins": 21.46194839477539, + "rewards/rejected": -19.54178810119629, + "step": 1910 + }, + { + "epoch": 0.88, + "learning_rate": 2.3599323753169907e-07, + "logits/chosen": -1.1762607097625732, + "logits/rejected": -1.1266227960586548, + "logps/chosen": -84.24024200439453, + "logps/rejected": -105.28382873535156, + "loss": 0.0088, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7776741981506348, + "rewards/margins": 21.163555145263672, + "rewards/rejected": -20.38587760925293, + "step": 1920 + }, + { + "epoch": 0.88, + "learning_rate": 2.354860524091293e-07, + "logits/chosen": -1.196942925453186, + "logits/rejected": -1.1432682275772095, + "logps/chosen": -95.1038818359375, + "logps/rejected": -106.1786117553711, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3159481287002563, + "rewards/margins": 21.558012008666992, + "rewards/rejected": -20.2420654296875, + "step": 1930 + }, + { + "epoch": 0.89, + "learning_rate": 2.3497886728655958e-07, + "logits/chosen": -1.1665886640548706, + "logits/rejected": -1.1258021593093872, + "logps/chosen": -83.95616149902344, + "logps/rejected": -100.2303237915039, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.901753306388855, + 
"rewards/margins": 19.35708999633789, + "rewards/rejected": -18.455339431762695, + "step": 1940 + }, + { + "epoch": 0.89, + "learning_rate": 2.3447168216398983e-07, + "logits/chosen": -1.18831205368042, + "logits/rejected": -1.1461542844772339, + "logps/chosen": -89.49617767333984, + "logps/rejected": -105.2798843383789, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2909132242202759, + "rewards/margins": 20.118661880493164, + "rewards/rejected": -18.827747344970703, + "step": 1950 + }, + { + "epoch": 0.89, + "learning_rate": 2.3396449704142012e-07, + "logits/chosen": -1.1922476291656494, + "logits/rejected": -1.1407110691070557, + "logps/chosen": -94.23783874511719, + "logps/rejected": -105.92415618896484, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5083615183830261, + "rewards/margins": 19.7877254486084, + "rewards/rejected": -19.279361724853516, + "step": 1960 + }, + { + "epoch": 0.9, + "learning_rate": 2.334573119188504e-07, + "logits/chosen": -1.1829791069030762, + "logits/rejected": -1.1273462772369385, + "logps/chosen": -88.82598876953125, + "logps/rejected": -101.88629150390625, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.235897183418274, + "rewards/margins": 21.914575576782227, + "rewards/rejected": -20.67867660522461, + "step": 1970 + }, + { + "epoch": 0.9, + "learning_rate": 2.3295012679628062e-07, + "logits/chosen": -1.2115113735198975, + "logits/rejected": -1.1657658815383911, + "logps/chosen": -86.12489318847656, + "logps/rejected": -107.81489562988281, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.653774082660675, + "rewards/margins": 20.462425231933594, + "rewards/rejected": -19.808650970458984, + "step": 1980 + }, + { + "epoch": 0.91, + "learning_rate": 2.3244294167371088e-07, + "logits/chosen": -1.2142908573150635, + "logits/rejected": -1.1777855157852173, + "logps/chosen": -84.75012969970703, + "logps/rejected": -106.76557922363281, + "loss": 0.0045, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 1.0658010244369507, + "rewards/margins": 20.134159088134766, + "rewards/rejected": -19.068355560302734, + "step": 1990 + }, + { + "epoch": 0.91, + "learning_rate": 2.3193575655114116e-07, + "logits/chosen": -1.2295953035354614, + "logits/rejected": -1.174081802368164, + "logps/chosen": -88.81068420410156, + "logps/rejected": -106.81929779052734, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5790882110595703, + "rewards/margins": 20.7359619140625, + "rewards/rejected": -19.156875610351562, + "step": 2000 + }, + { + "epoch": 0.91, + "eval_logits/chosen": -1.2681105136871338, + "eval_logits/rejected": -1.2061457633972168, + "eval_logps/chosen": -86.66743469238281, + "eval_logps/rejected": -104.74527740478516, + "eval_loss": 0.007192350458353758, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.3824937343597412, + "eval_rewards/margins": 21.084270477294922, + "eval_rewards/rejected": -19.7017765045166, + "eval_runtime": 77.6552, + "eval_samples_per_second": 36.855, + "eval_steps_per_second": 2.305, + "step": 2000 + }, + { + "epoch": 0.92, + "learning_rate": 2.3142857142857144e-07, + "logits/chosen": -1.194087266921997, + "logits/rejected": -1.1440551280975342, + "logps/chosen": -87.83335876464844, + "logps/rejected": -106.5591812133789, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5759541988372803, + "rewards/margins": 20.1513614654541, + "rewards/rejected": -18.575407028198242, + "step": 2010 + }, + { + "epoch": 0.92, + "learning_rate": 2.3092138630600167e-07, + "logits/chosen": -1.1671911478042603, + "logits/rejected": -1.1248931884765625, + "logps/chosen": -82.89833068847656, + "logps/rejected": -110.85752868652344, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.971392035484314, + "rewards/margins": 20.262601852416992, + "rewards/rejected": -19.291210174560547, + "step": 2020 + }, + { + "epoch": 0.93, + "learning_rate": 
2.3041420118343192e-07, + "logits/chosen": -1.1789249181747437, + "logits/rejected": -1.1308996677398682, + "logps/chosen": -85.59063720703125, + "logps/rejected": -98.32166290283203, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9662765264511108, + "rewards/margins": 20.44546890258789, + "rewards/rejected": -18.479190826416016, + "step": 2030 + }, + { + "epoch": 0.93, + "learning_rate": 2.299070160608622e-07, + "logits/chosen": -1.1901233196258545, + "logits/rejected": -1.1428929567337036, + "logps/chosen": -83.4649887084961, + "logps/rejected": -101.49374389648438, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.49424687027931213, + "rewards/margins": 19.59086036682129, + "rewards/rejected": -19.09661293029785, + "step": 2040 + }, + { + "epoch": 0.94, + "learning_rate": 2.2939983093829248e-07, + "logits/chosen": -1.211072325706482, + "logits/rejected": -1.1505995988845825, + "logps/chosen": -92.88377380371094, + "logps/rejected": -105.16926574707031, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.585486650466919, + "rewards/margins": 20.312013626098633, + "rewards/rejected": -18.72652816772461, + "step": 2050 + }, + { + "epoch": 0.94, + "learning_rate": 2.2889264581572274e-07, + "logits/chosen": -1.1979122161865234, + "logits/rejected": -1.1408374309539795, + "logps/chosen": -88.95879364013672, + "logps/rejected": -100.50065612792969, + "loss": 0.0088, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.2848825454711914, + "rewards/margins": 20.595218658447266, + "rewards/rejected": -18.31033706665039, + "step": 2060 + }, + { + "epoch": 0.94, + "learning_rate": 2.2838546069315297e-07, + "logits/chosen": -1.1775312423706055, + "logits/rejected": -1.1326141357421875, + "logps/chosen": -83.72702026367188, + "logps/rejected": -104.96478271484375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2204118967056274, + "rewards/margins": 20.910999298095703, + 
"rewards/rejected": -19.69058609008789, + "step": 2070 + }, + { + "epoch": 0.95, + "learning_rate": 2.2787827557058325e-07, + "logits/chosen": -1.1710788011550903, + "logits/rejected": -1.1334991455078125, + "logps/chosen": -84.7524185180664, + "logps/rejected": -103.06710052490234, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.717494249343872, + "rewards/margins": 20.584232330322266, + "rewards/rejected": -18.866741180419922, + "step": 2080 + }, + { + "epoch": 0.95, + "learning_rate": 2.273710904480135e-07, + "logits/chosen": -1.1447703838348389, + "logits/rejected": -1.1037436723709106, + "logps/chosen": -88.0915298461914, + "logps/rejected": -101.42835998535156, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3874223232269287, + "rewards/margins": 19.40856170654297, + "rewards/rejected": -18.02113914489746, + "step": 2090 + }, + { + "epoch": 0.96, + "learning_rate": 2.2686390532544378e-07, + "logits/chosen": -1.1998827457427979, + "logits/rejected": -1.144376277923584, + "logps/chosen": -87.5056381225586, + "logps/rejected": -100.17271423339844, + "loss": 0.0066, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.228088855743408, + "rewards/margins": 20.235179901123047, + "rewards/rejected": -18.007089614868164, + "step": 2100 + }, + { + "epoch": 0.96, + "eval_logits/chosen": -1.2238779067993164, + "eval_logits/rejected": -1.1694415807724, + "eval_logps/chosen": -86.23553466796875, + "eval_logps/rejected": -102.94633483886719, + "eval_loss": 0.0070498245768249035, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.5984489917755127, + "eval_rewards/margins": 20.400758743286133, + "eval_rewards/rejected": -18.802310943603516, + "eval_runtime": 62.0998, + "eval_samples_per_second": 46.087, + "eval_steps_per_second": 2.882, + "step": 2100 + }, + { + "epoch": 0.96, + "learning_rate": 2.2635672020287406e-07, + "logits/chosen": -1.1710069179534912, + "logits/rejected": 
-1.1208176612854004, + "logps/chosen": -89.8357162475586, + "logps/rejected": -104.9393310546875, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2994686365127563, + "rewards/margins": 20.46290397644043, + "rewards/rejected": -19.163436889648438, + "step": 2110 + }, + { + "epoch": 0.97, + "learning_rate": 2.258495350803043e-07, + "logits/chosen": -1.167830228805542, + "logits/rejected": -1.1247303485870361, + "logps/chosen": -88.80085754394531, + "logps/rejected": -101.0454330444336, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7379745244979858, + "rewards/margins": 20.091503143310547, + "rewards/rejected": -18.353527069091797, + "step": 2120 + }, + { + "epoch": 0.97, + "learning_rate": 2.2534234995773454e-07, + "logits/chosen": -1.1950472593307495, + "logits/rejected": -1.144486427307129, + "logps/chosen": -91.32661437988281, + "logps/rejected": -110.21795654296875, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46089300513267517, + "rewards/margins": 21.63949966430664, + "rewards/rejected": -21.178604125976562, + "step": 2130 + }, + { + "epoch": 0.98, + "learning_rate": 2.2483516483516483e-07, + "logits/chosen": -1.1894288063049316, + "logits/rejected": -1.1427417993545532, + "logps/chosen": -85.63072967529297, + "logps/rejected": -102.2844467163086, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9994566440582275, + "rewards/margins": 21.046451568603516, + "rewards/rejected": -19.0469913482666, + "step": 2140 + }, + { + "epoch": 0.98, + "learning_rate": 2.243279797125951e-07, + "logits/chosen": -1.1913617849349976, + "logits/rejected": -1.1351690292358398, + "logps/chosen": -86.342529296875, + "logps/rejected": -105.4500503540039, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3321704864501953, + "rewards/margins": 20.112749099731445, + "rewards/rejected": -18.780582427978516, + "step": 2150 + }, + { + "epoch": 0.99, + "learning_rate": 
2.2382079459002536e-07, + "logits/chosen": -1.200955867767334, + "logits/rejected": -1.141643762588501, + "logps/chosen": -93.14427185058594, + "logps/rejected": -100.25897979736328, + "loss": 0.0028, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7122005224227905, + "rewards/margins": 20.42462158203125, + "rewards/rejected": -18.712421417236328, + "step": 2160 + }, + { + "epoch": 0.99, + "learning_rate": 2.233136094674556e-07, + "logits/chosen": -1.2126085758209229, + "logits/rejected": -1.1542718410491943, + "logps/chosen": -94.84847259521484, + "logps/rejected": -105.94117736816406, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.69661545753479, + "rewards/margins": 20.98930549621582, + "rewards/rejected": -19.29269027709961, + "step": 2170 + }, + { + "epoch": 0.99, + "learning_rate": 2.2280642434488587e-07, + "logits/chosen": -1.2024385929107666, + "logits/rejected": -1.15294349193573, + "logps/chosen": -81.07176208496094, + "logps/rejected": -103.16294860839844, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.119349479675293, + "rewards/margins": 21.580371856689453, + "rewards/rejected": -19.461023330688477, + "step": 2180 + }, + { + "epoch": 1.0, + "learning_rate": 2.2229923922231615e-07, + "logits/chosen": -1.1685454845428467, + "logits/rejected": -1.1278364658355713, + "logps/chosen": -91.21316528320312, + "logps/rejected": -108.0694580078125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.229431629180908, + "rewards/margins": 21.74981689453125, + "rewards/rejected": -19.5203857421875, + "step": 2190 + }, + { + "epoch": 1.0, + "learning_rate": 2.217920540997464e-07, + "logits/chosen": -1.1836879253387451, + "logits/rejected": -1.1423550844192505, + "logps/chosen": -88.64768981933594, + "logps/rejected": -109.5053482055664, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5388556718826294, + "rewards/margins": 20.56551742553711, + "rewards/rejected": 
-20.026660919189453, + "step": 2200 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -1.2711361646652222, + "eval_logits/rejected": -1.2118490934371948, + "eval_logps/chosen": -86.72749328613281, + "eval_logps/rejected": -104.70652770996094, + "eval_loss": 0.0074717202223837376, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": 1.352469801902771, + "eval_rewards/margins": 21.034873962402344, + "eval_rewards/rejected": -19.682405471801758, + "eval_runtime": 64.1917, + "eval_samples_per_second": 44.585, + "eval_steps_per_second": 2.789, + "step": 2200 + }, + { + "epoch": 1.01, + "learning_rate": 2.2128486897717668e-07, + "logits/chosen": -1.2156295776367188, + "logits/rejected": -1.1702911853790283, + "logps/chosen": -84.73490142822266, + "logps/rejected": -109.97856140136719, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8699300289154053, + "rewards/margins": 22.461772918701172, + "rewards/rejected": -20.591842651367188, + "step": 2210 + }, + { + "epoch": 1.01, + "learning_rate": 2.207776838546069e-07, + "logits/chosen": -1.2317909002304077, + "logits/rejected": -1.1759793758392334, + "logps/chosen": -88.38722229003906, + "logps/rejected": -110.55941009521484, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2832095623016357, + "rewards/margins": 21.7716007232666, + "rewards/rejected": -19.488391876220703, + "step": 2220 + }, + { + "epoch": 1.02, + "learning_rate": 2.202704987320372e-07, + "logits/chosen": -1.240782618522644, + "logits/rejected": -1.1896260976791382, + "logps/chosen": -90.6281509399414, + "logps/rejected": -107.0479736328125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5835065841674805, + "rewards/margins": 21.230846405029297, + "rewards/rejected": -19.6473388671875, + "step": 2230 + }, + { + "epoch": 1.02, + "learning_rate": 2.1976331360946745e-07, + "logits/chosen": -1.2149629592895508, + "logits/rejected": -1.1640093326568604, + 
"logps/chosen": -90.72506713867188, + "logps/rejected": -106.3701171875, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3114402294158936, + "rewards/margins": 21.860370635986328, + "rewards/rejected": -20.54892921447754, + "step": 2240 + }, + { + "epoch": 1.03, + "learning_rate": 2.1925612848689773e-07, + "logits/chosen": -1.207849383354187, + "logits/rejected": -1.1694178581237793, + "logps/chosen": -88.57085418701172, + "logps/rejected": -107.42121887207031, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3758482933044434, + "rewards/margins": 21.30167007446289, + "rewards/rejected": -19.925823211669922, + "step": 2250 + }, + { + "epoch": 1.03, + "learning_rate": 2.1874894336432796e-07, + "logits/chosen": -1.221719741821289, + "logits/rejected": -1.1561182737350464, + "logps/chosen": -92.80986022949219, + "logps/rejected": -109.31239318847656, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.313194513320923, + "rewards/margins": 23.222518920898438, + "rewards/rejected": -20.909326553344727, + "step": 2260 + }, + { + "epoch": 1.04, + "learning_rate": 2.182417582417582e-07, + "logits/chosen": -1.2283899784088135, + "logits/rejected": -1.1857209205627441, + "logps/chosen": -86.95245361328125, + "logps/rejected": -110.2240219116211, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5342823266983032, + "rewards/margins": 21.846160888671875, + "rewards/rejected": -20.311880111694336, + "step": 2270 + }, + { + "epoch": 1.04, + "learning_rate": 2.177345731191885e-07, + "logits/chosen": -1.2207567691802979, + "logits/rejected": -1.1733354330062866, + "logps/chosen": -89.40998077392578, + "logps/rejected": -108.59842681884766, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.084045648574829, + "rewards/margins": 21.383882522583008, + "rewards/rejected": -20.29983901977539, + "step": 2280 + }, + { + "epoch": 1.05, + "learning_rate": 2.1722738799661877e-07, + "logits/chosen": 
-1.2484939098358154, + "logits/rejected": -1.186006784439087, + "logps/chosen": -91.48284149169922, + "logps/rejected": -102.3342514038086, + "loss": 0.0071, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3055270910263062, + "rewards/margins": 20.427013397216797, + "rewards/rejected": -19.12148666381836, + "step": 2290 + }, + { + "epoch": 1.05, + "learning_rate": 2.1672020287404903e-07, + "logits/chosen": -1.2090219259262085, + "logits/rejected": -1.1676021814346313, + "logps/chosen": -89.25651550292969, + "logps/rejected": -110.95772552490234, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.608730673789978, + "rewards/margins": 20.88467025756836, + "rewards/rejected": -20.275938034057617, + "step": 2300 + }, + { + "epoch": 1.05, + "eval_logits/chosen": -1.3087431192398071, + "eval_logits/rejected": -1.2488420009613037, + "eval_logps/chosen": -87.20025634765625, + "eval_logps/rejected": -105.1304702758789, + "eval_loss": 0.007476452272385359, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.1160887479782104, + "eval_rewards/margins": 21.010465621948242, + "eval_rewards/rejected": -19.894376754760742, + "eval_runtime": 68.119, + "eval_samples_per_second": 42.015, + "eval_steps_per_second": 2.628, + "step": 2300 + }, + { + "epoch": 1.05, + "learning_rate": 2.1621301775147925e-07, + "logits/chosen": -1.2305896282196045, + "logits/rejected": -1.1756011247634888, + "logps/chosen": -85.5489501953125, + "logps/rejected": -109.3161849975586, + "loss": 0.0017, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7630363702774048, + "rewards/margins": 21.09680938720703, + "rewards/rejected": -20.333772659301758, + "step": 2310 + }, + { + "epoch": 1.06, + "learning_rate": 2.1570583262890953e-07, + "logits/chosen": -1.2177748680114746, + "logits/rejected": -1.1697107553482056, + "logps/chosen": -85.29845428466797, + "logps/rejected": -109.9201889038086, + "loss": 0.0066, + "rewards/accuracies": 1.0, 
+ "rewards/chosen": 0.4945674538612366, + "rewards/margins": 21.288578033447266, + "rewards/rejected": -20.794010162353516, + "step": 2320 + }, + { + "epoch": 1.06, + "learning_rate": 2.1519864750633982e-07, + "logits/chosen": -1.2171680927276611, + "logits/rejected": -1.1630264520645142, + "logps/chosen": -88.26265716552734, + "logps/rejected": -104.31333923339844, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4045048952102661, + "rewards/margins": 20.78306770324707, + "rewards/rejected": -19.378559112548828, + "step": 2330 + }, + { + "epoch": 1.07, + "learning_rate": 2.1469146238377007e-07, + "logits/chosen": -1.2630029916763306, + "logits/rejected": -1.2106314897537231, + "logps/chosen": -88.36056518554688, + "logps/rejected": -110.58036041259766, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7803552150726318, + "rewards/margins": 22.81916046142578, + "rewards/rejected": -21.038806915283203, + "step": 2340 + }, + { + "epoch": 1.07, + "learning_rate": 2.1418427726120035e-07, + "logits/chosen": -1.2328459024429321, + "logits/rejected": -1.1753827333450317, + "logps/chosen": -90.98271942138672, + "logps/rejected": -107.30308532714844, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.918079137802124, + "rewards/margins": 22.566545486450195, + "rewards/rejected": -20.648466110229492, + "step": 2350 + }, + { + "epoch": 1.08, + "learning_rate": 2.1367709213863058e-07, + "logits/chosen": -1.221885085105896, + "logits/rejected": -1.1747316122055054, + "logps/chosen": -83.33541870117188, + "logps/rejected": -107.0094985961914, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8313859701156616, + "rewards/margins": 22.628454208374023, + "rewards/rejected": -20.797067642211914, + "step": 2360 + }, + { + "epoch": 1.08, + "learning_rate": 2.1316990701606086e-07, + "logits/chosen": -1.2495050430297852, + "logits/rejected": -1.1890289783477783, + "logps/chosen": -86.03840637207031, + 
"logps/rejected": -112.093505859375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5868552923202515, + "rewards/margins": 22.05801010131836, + "rewards/rejected": -21.471149444580078, + "step": 2370 + }, + { + "epoch": 1.09, + "learning_rate": 2.126627218934911e-07, + "logits/chosen": -1.2298763990402222, + "logits/rejected": -1.1767555475234985, + "logps/chosen": -84.1092758178711, + "logps/rejected": -106.68165588378906, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08795030415058136, + "rewards/margins": 21.050338745117188, + "rewards/rejected": -20.962390899658203, + "step": 2380 + }, + { + "epoch": 1.09, + "learning_rate": 2.121555367709214e-07, + "logits/chosen": -1.2427847385406494, + "logits/rejected": -1.2060487270355225, + "logps/chosen": -85.17793273925781, + "logps/rejected": -109.41981506347656, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06969626992940903, + "rewards/margins": 20.437192916870117, + "rewards/rejected": -20.367496490478516, + "step": 2390 + }, + { + "epoch": 1.1, + "learning_rate": 2.1164835164835165e-07, + "logits/chosen": -1.223024845123291, + "logits/rejected": -1.1713087558746338, + "logps/chosen": -85.1490707397461, + "logps/rejected": -114.04695129394531, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0163342952728271, + "rewards/margins": 22.59818458557129, + "rewards/rejected": -21.581851959228516, + "step": 2400 + }, + { + "epoch": 1.1, + "eval_logits/chosen": -1.3130491971969604, + "eval_logits/rejected": -1.2457395792007446, + "eval_logps/chosen": -87.72120666503906, + "eval_logps/rejected": -107.27316284179688, + "eval_loss": 0.007567834109067917, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": 0.8556116223335266, + "eval_rewards/margins": 21.821340560913086, + "eval_rewards/rejected": -20.965728759765625, + "eval_runtime": 68.7759, + "eval_samples_per_second": 41.613, + "eval_steps_per_second": 
2.603, + "step": 2400 + }, + { + "epoch": 1.1, + "learning_rate": 2.1114116652578188e-07, + "logits/chosen": -1.224714756011963, + "logits/rejected": -1.1649423837661743, + "logps/chosen": -91.27989959716797, + "logps/rejected": -109.84356689453125, + "loss": 0.0066, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8775809407234192, + "rewards/margins": 22.14706039428711, + "rewards/rejected": -21.26947784423828, + "step": 2410 + }, + { + "epoch": 1.1, + "learning_rate": 2.1063398140321216e-07, + "logits/chosen": -1.2352392673492432, + "logits/rejected": -1.1807770729064941, + "logps/chosen": -86.58528137207031, + "logps/rejected": -107.39393615722656, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1014773845672607, + "rewards/margins": 22.366958618164062, + "rewards/rejected": -21.265480041503906, + "step": 2420 + }, + { + "epoch": 1.11, + "learning_rate": 2.1012679628064244e-07, + "logits/chosen": -1.271728515625, + "logits/rejected": -1.218794584274292, + "logps/chosen": -83.79855346679688, + "logps/rejected": -115.31538391113281, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1647331714630127, + "rewards/margins": 23.370630264282227, + "rewards/rejected": -22.205896377563477, + "step": 2430 + }, + { + "epoch": 1.11, + "learning_rate": 2.096196111580727e-07, + "logits/chosen": -1.2588279247283936, + "logits/rejected": -1.1938583850860596, + "logps/chosen": -92.08492279052734, + "logps/rejected": -108.75785827636719, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8567431569099426, + "rewards/margins": 22.12790298461914, + "rewards/rejected": -21.271160125732422, + "step": 2440 + }, + { + "epoch": 1.12, + "learning_rate": 2.0911242603550297e-07, + "logits/chosen": -1.254792332649231, + "logits/rejected": -1.184197187423706, + "logps/chosen": -92.77003479003906, + "logps/rejected": -114.69380950927734, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 
2.006107807159424, + "rewards/margins": 24.762887954711914, + "rewards/rejected": -22.756778717041016, + "step": 2450 + }, + { + "epoch": 1.12, + "learning_rate": 2.086052409129332e-07, + "logits/chosen": -1.2276710271835327, + "logits/rejected": -1.172614336013794, + "logps/chosen": -92.51925659179688, + "logps/rejected": -109.40245056152344, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4506779909133911, + "rewards/margins": 22.675724029541016, + "rewards/rejected": -21.225048065185547, + "step": 2460 + }, + { + "epoch": 1.13, + "learning_rate": 2.0809805579036348e-07, + "logits/chosen": -1.2417436838150024, + "logits/rejected": -1.1916046142578125, + "logps/chosen": -87.77659606933594, + "logps/rejected": -107.12300109863281, + "loss": 0.0015, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3042961359024048, + "rewards/margins": 21.55357551574707, + "rewards/rejected": -20.249279022216797, + "step": 2470 + }, + { + "epoch": 1.13, + "learning_rate": 2.0759087066779374e-07, + "logits/chosen": -1.2384653091430664, + "logits/rejected": -1.19193434715271, + "logps/chosen": -87.645751953125, + "logps/rejected": -111.66351318359375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5307129621505737, + "rewards/margins": 22.680551528930664, + "rewards/rejected": -22.149839401245117, + "step": 2480 + }, + { + "epoch": 1.14, + "learning_rate": 2.0708368554522402e-07, + "logits/chosen": -1.239863634109497, + "logits/rejected": -1.1838042736053467, + "logps/chosen": -82.25237274169922, + "logps/rejected": -107.54081726074219, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.317577600479126, + "rewards/margins": 21.645660400390625, + "rewards/rejected": -20.328083038330078, + "step": 2490 + }, + { + "epoch": 1.14, + "learning_rate": 2.0657650042265424e-07, + "logits/chosen": -1.238755464553833, + "logits/rejected": -1.1892435550689697, + "logps/chosen": -91.93738555908203, + "logps/rejected": 
-105.78620910644531, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8200355768203735, + "rewards/margins": 21.127065658569336, + "rewards/rejected": -20.307029724121094, + "step": 2500 + }, + { + "epoch": 1.14, + "eval_logits/chosen": -1.2944495677947998, + "eval_logits/rejected": -1.2309415340423584, + "eval_logps/chosen": -87.02932739257812, + "eval_logps/rejected": -104.29837799072266, + "eval_loss": 0.006961911916732788, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": 1.2015457153320312, + "eval_rewards/margins": 20.67987823486328, + "eval_rewards/rejected": -19.478334426879883, + "eval_runtime": 71.1702, + "eval_samples_per_second": 40.213, + "eval_steps_per_second": 2.515, + "step": 2500 + }, + { + "epoch": 1.15, + "learning_rate": 2.0606931530008452e-07, + "logits/chosen": -1.2269400358200073, + "logits/rejected": -1.165895700454712, + "logps/chosen": -89.00814056396484, + "logps/rejected": -106.82181549072266, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1590993404388428, + "rewards/margins": 21.093416213989258, + "rewards/rejected": -19.934314727783203, + "step": 2510 + }, + { + "epoch": 1.15, + "learning_rate": 2.0556213017751478e-07, + "logits/chosen": -1.244373083114624, + "logits/rejected": -1.1799051761627197, + "logps/chosen": -86.1784896850586, + "logps/rejected": -108.1005859375, + "loss": 0.0047, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9702383279800415, + "rewards/margins": 21.26332664489746, + "rewards/rejected": -20.293087005615234, + "step": 2520 + }, + { + "epoch": 1.15, + "learning_rate": 2.0505494505494506e-07, + "logits/chosen": -1.239007830619812, + "logits/rejected": -1.188663363456726, + "logps/chosen": -89.08403015136719, + "logps/rejected": -109.1912612915039, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8398208618164062, + "rewards/margins": 20.724300384521484, + "rewards/rejected": -19.884479522705078, + "step": 2530 
+ }, + { + "epoch": 1.16, + "learning_rate": 2.0454775993237531e-07, + "logits/chosen": -1.259961485862732, + "logits/rejected": -1.1992539167404175, + "logps/chosen": -88.00000762939453, + "logps/rejected": -105.28775787353516, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7344595193862915, + "rewards/margins": 22.018203735351562, + "rewards/rejected": -20.283742904663086, + "step": 2540 + }, + { + "epoch": 1.16, + "learning_rate": 2.0404057480980554e-07, + "logits/chosen": -1.258159875869751, + "logits/rejected": -1.1919949054718018, + "logps/chosen": -91.0265121459961, + "logps/rejected": -109.3829345703125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2989047765731812, + "rewards/margins": 22.686710357666016, + "rewards/rejected": -21.387805938720703, + "step": 2550 + }, + { + "epoch": 1.17, + "learning_rate": 2.0353338968723582e-07, + "logits/chosen": -1.216018557548523, + "logits/rejected": -1.1775795221328735, + "logps/chosen": -84.73078918457031, + "logps/rejected": -113.40096282958984, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12488274276256561, + "rewards/margins": 21.136188507080078, + "rewards/rejected": -21.261070251464844, + "step": 2560 + }, + { + "epoch": 1.17, + "learning_rate": 2.030262045646661e-07, + "logits/chosen": -1.2377723455429077, + "logits/rejected": -1.1810615062713623, + "logps/chosen": -93.79662322998047, + "logps/rejected": -107.40061950683594, + "loss": 0.0061, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.11829134076833725, + "rewards/margins": 20.636348724365234, + "rewards/rejected": -20.754640579223633, + "step": 2570 + }, + { + "epoch": 1.18, + "learning_rate": 2.0251901944209636e-07, + "logits/chosen": -1.2281471490859985, + "logits/rejected": -1.1688311100006104, + "logps/chosen": -91.79940795898438, + "logps/rejected": -107.32728576660156, + "loss": 0.0034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 
0.22485239803791046, + "rewards/margins": 20.858522415161133, + "rewards/rejected": -20.633668899536133, + "step": 2580 + }, + { + "epoch": 1.18, + "learning_rate": 2.0201183431952664e-07, + "logits/chosen": -1.197676658630371, + "logits/rejected": -1.1506736278533936, + "logps/chosen": -81.2101821899414, + "logps/rejected": -109.7088623046875, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2717623710632324, + "rewards/margins": 21.169933319091797, + "rewards/rejected": -21.441696166992188, + "step": 2590 + }, + { + "epoch": 1.19, + "learning_rate": 2.0150464919695687e-07, + "logits/chosen": -1.2182395458221436, + "logits/rejected": -1.177594542503357, + "logps/chosen": -80.82548522949219, + "logps/rejected": -109.75965881347656, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1711565256118774, + "rewards/margins": 23.289653778076172, + "rewards/rejected": -22.118499755859375, + "step": 2600 + }, + { + "epoch": 1.19, + "eval_logits/chosen": -1.2811179161071777, + "eval_logits/rejected": -1.2195229530334473, + "eval_logps/chosen": -87.84212493896484, + "eval_logps/rejected": -107.58037567138672, + "eval_loss": 0.007497187703847885, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.7951509952545166, + "eval_rewards/margins": 21.91447639465332, + "eval_rewards/rejected": -21.119325637817383, + "eval_runtime": 67.5783, + "eval_samples_per_second": 42.351, + "eval_steps_per_second": 2.649, + "step": 2600 + }, + { + "epoch": 1.19, + "learning_rate": 2.0099746407438715e-07, + "logits/chosen": -1.2012989521026611, + "logits/rejected": -1.173628807067871, + "logps/chosen": -84.23133087158203, + "logps/rejected": -108.86763000488281, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.148093581199646, + "rewards/margins": 21.286386489868164, + "rewards/rejected": -20.138290405273438, + "step": 2610 + }, + { + "epoch": 1.2, + "learning_rate": 2.004902789518174e-07, + "logits/chosen": 
-1.236003041267395, + "logits/rejected": -1.1768968105316162, + "logps/chosen": -91.87477111816406, + "logps/rejected": -112.02117919921875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2967796325683594, + "rewards/margins": 22.90878677368164, + "rewards/rejected": -21.612009048461914, + "step": 2620 + }, + { + "epoch": 1.2, + "learning_rate": 1.9998309382924768e-07, + "logits/chosen": -1.2255749702453613, + "logits/rejected": -1.169988989830017, + "logps/chosen": -89.92630004882812, + "logps/rejected": -109.48509216308594, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11678309738636017, + "rewards/margins": 20.700876235961914, + "rewards/rejected": -20.584091186523438, + "step": 2630 + }, + { + "epoch": 1.2, + "learning_rate": 1.9947590870667794e-07, + "logits/chosen": -1.2192347049713135, + "logits/rejected": -1.159459114074707, + "logps/chosen": -96.32530212402344, + "logps/rejected": -110.26582336425781, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2365353107452393, + "rewards/margins": 22.127044677734375, + "rewards/rejected": -19.8905086517334, + "step": 2640 + }, + { + "epoch": 1.21, + "learning_rate": 1.989687235841082e-07, + "logits/chosen": -1.2065198421478271, + "logits/rejected": -1.147120714187622, + "logps/chosen": -88.47337341308594, + "logps/rejected": -106.5348129272461, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9000838994979858, + "rewards/margins": 20.618175506591797, + "rewards/rejected": -19.718093872070312, + "step": 2650 + }, + { + "epoch": 1.21, + "learning_rate": 1.9846153846153844e-07, + "logits/chosen": -1.195985198020935, + "logits/rejected": -1.1491913795471191, + "logps/chosen": -84.20733642578125, + "logps/rejected": -109.67060852050781, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2371699810028076, + "rewards/margins": 22.225109100341797, + "rewards/rejected": -20.98794174194336, + "step": 2660 + }, + { + "epoch": 
1.22, + "learning_rate": 1.9795435333896873e-07, + "logits/chosen": -1.1977856159210205, + "logits/rejected": -1.1537766456604004, + "logps/chosen": -91.16226196289062, + "logps/rejected": -111.68495178222656, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.734151601791382, + "rewards/margins": 23.330568313598633, + "rewards/rejected": -20.596416473388672, + "step": 2670 + }, + { + "epoch": 1.22, + "learning_rate": 1.9744716821639898e-07, + "logits/chosen": -1.224413275718689, + "logits/rejected": -1.1784846782684326, + "logps/chosen": -87.34507751464844, + "logps/rejected": -109.75943756103516, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3192319869995117, + "rewards/margins": 22.662378311157227, + "rewards/rejected": -21.343143463134766, + "step": 2680 + }, + { + "epoch": 1.23, + "learning_rate": 1.9693998309382926e-07, + "logits/chosen": -1.208308219909668, + "logits/rejected": -1.1619625091552734, + "logps/chosen": -88.23912048339844, + "logps/rejected": -107.49666595458984, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7308362126350403, + "rewards/margins": 20.509281158447266, + "rewards/rejected": -19.778446197509766, + "step": 2690 + }, + { + "epoch": 1.23, + "learning_rate": 1.964327979712595e-07, + "logits/chosen": -1.2484124898910522, + "logits/rejected": -1.1881217956542969, + "logps/chosen": -90.5484390258789, + "logps/rejected": -105.3963851928711, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22395610809326172, + "rewards/margins": 21.083341598510742, + "rewards/rejected": -21.30729866027832, + "step": 2700 + }, + { + "epoch": 1.23, + "eval_logits/chosen": -1.2836943864822388, + "eval_logits/rejected": -1.2240569591522217, + "eval_logps/chosen": -88.02273559570312, + "eval_logps/rejected": -107.38671875, + "eval_loss": 0.007047051563858986, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.704842746257782, + "eval_rewards/margins": 
21.727344512939453, + "eval_rewards/rejected": -21.022504806518555, + "eval_runtime": 66.849, + "eval_samples_per_second": 42.813, + "eval_steps_per_second": 2.678, + "step": 2700 + }, + { + "epoch": 1.24, + "learning_rate": 1.9592561284868977e-07, + "logits/chosen": -1.2299364805221558, + "logits/rejected": -1.1793487071990967, + "logps/chosen": -92.32845306396484, + "logps/rejected": -109.91078186035156, + "loss": 0.0012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9196799993515015, + "rewards/margins": 21.892927169799805, + "rewards/rejected": -20.973247528076172, + "step": 2710 + }, + { + "epoch": 1.24, + "learning_rate": 1.9541842772612002e-07, + "logits/chosen": -1.2148702144622803, + "logits/rejected": -1.1717993021011353, + "logps/chosen": -88.74729919433594, + "logps/rejected": -109.92634582519531, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4501422047615051, + "rewards/margins": 20.923179626464844, + "rewards/rejected": -20.47303581237793, + "step": 2720 + }, + { + "epoch": 1.25, + "learning_rate": 1.949112426035503e-07, + "logits/chosen": -1.2163503170013428, + "logits/rejected": -1.1693785190582275, + "logps/chosen": -87.26588439941406, + "logps/rejected": -110.53138732910156, + "loss": 0.0058, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.8547931909561157, + "rewards/margins": 21.771400451660156, + "rewards/rejected": -22.626192092895508, + "step": 2730 + }, + { + "epoch": 1.25, + "learning_rate": 1.9440405748098056e-07, + "logits/chosen": -1.2386237382888794, + "logits/rejected": -1.1867671012878418, + "logps/chosen": -89.03327941894531, + "logps/rejected": -112.5365219116211, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6293159127235413, + "rewards/margins": 22.378183364868164, + "rewards/rejected": -21.74886703491211, + "step": 2740 + }, + { + "epoch": 1.26, + "learning_rate": 1.938968723584108e-07, + "logits/chosen": -1.241208791732788, + "logits/rejected": 
-1.1873838901519775, + "logps/chosen": -88.9361801147461, + "logps/rejected": -111.81880187988281, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0905447006225586, + "rewards/margins": 23.877498626708984, + "rewards/rejected": -22.78695297241211, + "step": 2750 + }, + { + "epoch": 1.26, + "learning_rate": 1.9338968723584107e-07, + "logits/chosen": -1.2293826341629028, + "logits/rejected": -1.1767637729644775, + "logps/chosen": -84.17843627929688, + "logps/rejected": -110.3796157836914, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36438900232315063, + "rewards/margins": 22.36743927001953, + "rewards/rejected": -22.731830596923828, + "step": 2760 + }, + { + "epoch": 1.26, + "learning_rate": 1.9288250211327135e-07, + "logits/chosen": -1.2488285303115845, + "logits/rejected": -1.192888617515564, + "logps/chosen": -94.11280822753906, + "logps/rejected": -113.39571380615234, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8494230508804321, + "rewards/margins": 21.94849967956543, + "rewards/rejected": -22.797924041748047, + "step": 2770 + }, + { + "epoch": 1.27, + "learning_rate": 1.923753169907016e-07, + "logits/chosen": -1.2614643573760986, + "logits/rejected": -1.2098888158798218, + "logps/chosen": -89.56803894042969, + "logps/rejected": -112.6842041015625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0361621156334877, + "rewards/margins": 23.073833465576172, + "rewards/rejected": -23.10999298095703, + "step": 2780 + }, + { + "epoch": 1.27, + "learning_rate": 1.9186813186813186e-07, + "logits/chosen": -1.2534093856811523, + "logits/rejected": -1.1944758892059326, + "logps/chosen": -93.7116470336914, + "logps/rejected": -106.58601379394531, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6133219003677368, + "rewards/margins": 21.842636108398438, + "rewards/rejected": -21.22931671142578, + "step": 2790 + }, + { + "epoch": 1.28, + "learning_rate": 
1.913609467455621e-07, + "logits/chosen": -1.245892882347107, + "logits/rejected": -1.1841986179351807, + "logps/chosen": -90.11885070800781, + "logps/rejected": -108.6605224609375, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8591505885124207, + "rewards/margins": 23.346960067749023, + "rewards/rejected": -22.487812042236328, + "step": 2800 + }, + { + "epoch": 1.28, + "eval_logits/chosen": -1.3369088172912598, + "eval_logits/rejected": -1.2667419910430908, + "eval_logps/chosen": -88.43457794189453, + "eval_logps/rejected": -109.54808807373047, + "eval_loss": 0.007251236122101545, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.4989199936389923, + "eval_rewards/margins": 22.602100372314453, + "eval_rewards/rejected": -22.103178024291992, + "eval_runtime": 65.6934, + "eval_samples_per_second": 43.566, + "eval_steps_per_second": 2.725, + "step": 2800 + }, + { + "epoch": 1.28, + "learning_rate": 1.908537616229924e-07, + "logits/chosen": -1.258395791053772, + "logits/rejected": -1.2066611051559448, + "logps/chosen": -88.32936096191406, + "logps/rejected": -113.46788024902344, + "loss": 0.0012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.060993265360593796, + "rewards/margins": 23.58555030822754, + "rewards/rejected": -23.52455711364746, + "step": 2810 + }, + { + "epoch": 1.29, + "learning_rate": 1.9034657650042265e-07, + "logits/chosen": -1.2754733562469482, + "logits/rejected": -1.1980760097503662, + "logps/chosen": -93.66505432128906, + "logps/rejected": -109.4578857421875, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4866414964199066, + "rewards/margins": 22.506893157958984, + "rewards/rejected": -22.020254135131836, + "step": 2820 + }, + { + "epoch": 1.29, + "learning_rate": 1.8983939137785293e-07, + "logits/chosen": -1.2679641246795654, + "logits/rejected": -1.2068798542022705, + "logps/chosen": -89.0657958984375, + "logps/rejected": -111.84321594238281, + "loss": 
0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.362867146730423, + "rewards/margins": 23.33441162109375, + "rewards/rejected": -23.697277069091797, + "step": 2830 + }, + { + "epoch": 1.3, + "learning_rate": 1.8933220625528315e-07, + "logits/chosen": -1.2522757053375244, + "logits/rejected": -1.179939866065979, + "logps/chosen": -93.52415466308594, + "logps/rejected": -112.2892074584961, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2486017942428589, + "rewards/margins": 23.161243438720703, + "rewards/rejected": -21.912641525268555, + "step": 2840 + }, + { + "epoch": 1.3, + "learning_rate": 1.8882502113271343e-07, + "logits/chosen": -1.2578299045562744, + "logits/rejected": -1.193433165550232, + "logps/chosen": -89.56175994873047, + "logps/rejected": -114.70777893066406, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8356913328170776, + "rewards/margins": 23.007110595703125, + "rewards/rejected": -22.171415328979492, + "step": 2850 + }, + { + "epoch": 1.31, + "learning_rate": 1.883178360101437e-07, + "logits/chosen": -1.279767632484436, + "logits/rejected": -1.2183212041854858, + "logps/chosen": -89.8489990234375, + "logps/rejected": -108.4361572265625, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6225892305374146, + "rewards/margins": 21.247386932373047, + "rewards/rejected": -20.624794006347656, + "step": 2860 + }, + { + "epoch": 1.31, + "learning_rate": 1.8781065088757397e-07, + "logits/chosen": -1.3064444065093994, + "logits/rejected": -1.251308798789978, + "logps/chosen": -88.62841033935547, + "logps/rejected": -114.151123046875, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6001108288764954, + "rewards/margins": 23.617412567138672, + "rewards/rejected": -23.017301559448242, + "step": 2870 + }, + { + "epoch": 1.31, + "learning_rate": 1.8730346576500422e-07, + "logits/chosen": -1.2963197231292725, + "logits/rejected": -1.2211467027664185, + "logps/chosen": 
-89.53153991699219, + "logps/rejected": -116.00457763671875, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3336631059646606, + "rewards/margins": 24.389131546020508, + "rewards/rejected": -23.055469512939453, + "step": 2880 + }, + { + "epoch": 1.32, + "learning_rate": 1.8679628064243448e-07, + "logits/chosen": -1.2721803188323975, + "logits/rejected": -1.209424614906311, + "logps/chosen": -90.9636459350586, + "logps/rejected": -112.62530517578125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8648780584335327, + "rewards/margins": 23.844661712646484, + "rewards/rejected": -22.97978401184082, + "step": 2890 + }, + { + "epoch": 1.32, + "learning_rate": 1.8628909551986473e-07, + "logits/chosen": -1.304807186126709, + "logits/rejected": -1.2427805662155151, + "logps/chosen": -83.58341979980469, + "logps/rejected": -114.4560546875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09369596093893051, + "rewards/margins": 23.40268325805664, + "rewards/rejected": -23.496379852294922, + "step": 2900 + }, + { + "epoch": 1.32, + "eval_logits/chosen": -1.3647960424423218, + "eval_logits/rejected": -1.2919734716415405, + "eval_logps/chosen": -88.49767303466797, + "eval_logps/rejected": -111.2530517578125, + "eval_loss": 0.007645368576049805, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.4673759341239929, + "eval_rewards/margins": 23.423046112060547, + "eval_rewards/rejected": -22.95566749572754, + "eval_runtime": 68.7318, + "eval_samples_per_second": 41.64, + "eval_steps_per_second": 2.604, + "step": 2900 + }, + { + "epoch": 1.33, + "learning_rate": 1.8578191039729501e-07, + "logits/chosen": -1.2682093381881714, + "logits/rejected": -1.2079681158065796, + "logps/chosen": -94.27190399169922, + "logps/rejected": -113.67183685302734, + "loss": 0.0047, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7304227352142334, + "rewards/margins": 24.069181442260742, + 
"rewards/rejected": -22.338756561279297, + "step": 2910 + }, + { + "epoch": 1.33, + "learning_rate": 1.8527472527472527e-07, + "logits/chosen": -1.2731274366378784, + "logits/rejected": -1.2099798917770386, + "logps/chosen": -90.7890625, + "logps/rejected": -106.4814224243164, + "loss": 0.0063, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.650849461555481, + "rewards/margins": 22.963346481323242, + "rewards/rejected": -22.312496185302734, + "step": 2920 + }, + { + "epoch": 1.34, + "learning_rate": 1.8476754015215555e-07, + "logits/chosen": -1.292454481124878, + "logits/rejected": -1.225913166999817, + "logps/chosen": -85.83187103271484, + "logps/rejected": -110.36527252197266, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2073954343795776, + "rewards/margins": 24.3538818359375, + "rewards/rejected": -23.146484375, + "step": 2930 + }, + { + "epoch": 1.34, + "learning_rate": 1.8426035502958578e-07, + "logits/chosen": -1.2939157485961914, + "logits/rejected": -1.241677165031433, + "logps/chosen": -84.26634216308594, + "logps/rejected": -108.85639953613281, + "loss": 0.0023, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.16938212513923645, + "rewards/margins": 22.67632293701172, + "rewards/rejected": -22.506940841674805, + "step": 2940 + }, + { + "epoch": 1.35, + "learning_rate": 1.8375316990701606e-07, + "logits/chosen": -1.2990548610687256, + "logits/rejected": -1.2253262996673584, + "logps/chosen": -85.23284149169922, + "logps/rejected": -111.3265609741211, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6182515025138855, + "rewards/margins": 23.335525512695312, + "rewards/rejected": -22.717273712158203, + "step": 2950 + }, + { + "epoch": 1.35, + "learning_rate": 1.832459847844463e-07, + "logits/chosen": -1.3059203624725342, + "logits/rejected": -1.2436602115631104, + "logps/chosen": -90.0726089477539, + "logps/rejected": -115.29248046875, + "loss": 0.0013, + "rewards/accuracies": 1.0, 
+ "rewards/chosen": 1.11955726146698, + "rewards/margins": 24.451736450195312, + "rewards/rejected": -23.332178115844727, + "step": 2960 + }, + { + "epoch": 1.36, + "learning_rate": 1.827387996618766e-07, + "logits/chosen": -1.2917280197143555, + "logits/rejected": -1.2361105680465698, + "logps/chosen": -83.81163024902344, + "logps/rejected": -110.36248779296875, + "loss": 0.0035, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.2116299867630005, + "rewards/margins": 23.077804565429688, + "rewards/rejected": -22.866174697875977, + "step": 2970 + }, + { + "epoch": 1.36, + "learning_rate": 1.8223161453930685e-07, + "logits/chosen": -1.2799623012542725, + "logits/rejected": -1.2227824926376343, + "logps/chosen": -87.18734741210938, + "logps/rejected": -111.6829605102539, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.030847549438476562, + "rewards/margins": 22.31467056274414, + "rewards/rejected": -22.283823013305664, + "step": 2980 + }, + { + "epoch": 1.36, + "learning_rate": 1.817244294167371e-07, + "logits/chosen": -1.2903432846069336, + "logits/rejected": -1.2311654090881348, + "logps/chosen": -92.79168701171875, + "logps/rejected": -119.3042984008789, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6965764164924622, + "rewards/margins": 24.21768569946289, + "rewards/rejected": -23.52111053466797, + "step": 2990 + }, + { + "epoch": 1.37, + "learning_rate": 1.8121724429416736e-07, + "logits/chosen": -1.2526795864105225, + "logits/rejected": -1.1851316690444946, + "logps/chosen": -89.67386627197266, + "logps/rejected": -108.59903717041016, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0026484727859497, + "rewards/margins": 23.317981719970703, + "rewards/rejected": -22.315330505371094, + "step": 3000 + }, + { + "epoch": 1.37, + "eval_logits/chosen": -1.3374384641647339, + "eval_logits/rejected": -1.2689754962921143, + "eval_logps/chosen": -87.6075668334961, + 
"eval_logps/rejected": -110.42879486083984, + "eval_loss": 0.007348579820245504, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.9124311804771423, + "eval_rewards/margins": 23.45596694946289, + "eval_rewards/rejected": -22.543535232543945, + "eval_runtime": 63.1636, + "eval_samples_per_second": 45.311, + "eval_steps_per_second": 2.834, + "step": 3000 + }, + { + "epoch": 1.37, + "learning_rate": 1.8071005917159764e-07, + "logits/chosen": -1.2701785564422607, + "logits/rejected": -1.2133753299713135, + "logps/chosen": -89.51029968261719, + "logps/rejected": -107.2208023071289, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4126553535461426, + "rewards/margins": 23.328351974487305, + "rewards/rejected": -21.915699005126953, + "step": 3010 + }, + { + "epoch": 1.38, + "learning_rate": 1.802028740490279e-07, + "logits/chosen": -1.2436578273773193, + "logits/rejected": -1.178425908088684, + "logps/chosen": -87.91740417480469, + "logps/rejected": -108.53468322753906, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.169164776802063, + "rewards/margins": 23.27883529663086, + "rewards/rejected": -22.109668731689453, + "step": 3020 + }, + { + "epoch": 1.38, + "learning_rate": 1.7969568892645814e-07, + "logits/chosen": -1.2551841735839844, + "logits/rejected": -1.1954948902130127, + "logps/chosen": -86.35130310058594, + "logps/rejected": -113.94222259521484, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3147376775741577, + "rewards/margins": 22.975399017333984, + "rewards/rejected": -22.660661697387695, + "step": 3030 + }, + { + "epoch": 1.39, + "learning_rate": 1.791885038038884e-07, + "logits/chosen": -1.2619261741638184, + "logits/rejected": -1.1995004415512085, + "logps/chosen": -89.64755249023438, + "logps/rejected": -115.27685546875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.632409691810608, + "rewards/margins": 25.825876235961914, + "rewards/rejected": 
-24.193464279174805, + "step": 3040 + }, + { + "epoch": 1.39, + "learning_rate": 1.7868131868131868e-07, + "logits/chosen": -1.2092974185943604, + "logits/rejected": -1.1581265926361084, + "logps/chosen": -94.53260803222656, + "logps/rejected": -113.39241790771484, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7243481278419495, + "rewards/margins": 24.363618850708008, + "rewards/rejected": -23.639272689819336, + "step": 3050 + }, + { + "epoch": 1.4, + "learning_rate": 1.7817413355874893e-07, + "logits/chosen": -1.2218453884124756, + "logits/rejected": -1.1740856170654297, + "logps/chosen": -87.55658721923828, + "logps/rejected": -112.15279388427734, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7927554845809937, + "rewards/margins": 24.38910484313965, + "rewards/rejected": -23.59635353088379, + "step": 3060 + }, + { + "epoch": 1.4, + "learning_rate": 1.7766694843617921e-07, + "logits/chosen": -1.272851586341858, + "logits/rejected": -1.206168293952942, + "logps/chosen": -83.55747985839844, + "logps/rejected": -120.4216079711914, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5537545084953308, + "rewards/margins": 24.476547241210938, + "rewards/rejected": -23.922794342041016, + "step": 3070 + }, + { + "epoch": 1.41, + "learning_rate": 1.7715976331360944e-07, + "logits/chosen": -1.2406799793243408, + "logits/rejected": -1.1959577798843384, + "logps/chosen": -88.22968292236328, + "logps/rejected": -118.29911041259766, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6699932813644409, + "rewards/margins": 24.441137313842773, + "rewards/rejected": -23.77114486694336, + "step": 3080 + }, + { + "epoch": 1.41, + "learning_rate": 1.7665257819103972e-07, + "logits/chosen": -1.2692559957504272, + "logits/rejected": -1.1990225315093994, + "logps/chosen": -91.19334411621094, + "logps/rejected": -115.9480209350586, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 
1.4864407777786255, + "rewards/margins": 24.107561111450195, + "rewards/rejected": -22.62112045288086, + "step": 3090 + }, + { + "epoch": 1.41, + "learning_rate": 1.7614539306846998e-07, + "logits/chosen": -1.2366522550582886, + "logits/rejected": -1.1838405132293701, + "logps/chosen": -89.22175598144531, + "logps/rejected": -109.91280364990234, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0529274940490723, + "rewards/margins": 23.17164421081543, + "rewards/rejected": -22.118717193603516, + "step": 3100 + }, + { + "epoch": 1.41, + "eval_logits/chosen": -1.3056086301803589, + "eval_logits/rejected": -1.2372196912765503, + "eval_logps/chosen": -87.62897491455078, + "eval_logps/rejected": -110.38069915771484, + "eval_loss": 0.007260460406541824, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.9017306566238403, + "eval_rewards/margins": 23.421215057373047, + "eval_rewards/rejected": -22.519485473632812, + "eval_runtime": 74.4757, + "eval_samples_per_second": 38.429, + "eval_steps_per_second": 2.403, + "step": 3100 + }, + { + "epoch": 1.42, + "learning_rate": 1.7563820794590026e-07, + "logits/chosen": -1.2446845769882202, + "logits/rejected": -1.1912556886672974, + "logps/chosen": -87.75724792480469, + "logps/rejected": -117.93830871582031, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7617045640945435, + "rewards/margins": 24.722881317138672, + "rewards/rejected": -23.961177825927734, + "step": 3110 + }, + { + "epoch": 1.42, + "learning_rate": 1.751310228233305e-07, + "logits/chosen": -1.2533457279205322, + "logits/rejected": -1.1890594959259033, + "logps/chosen": -94.13309478759766, + "logps/rejected": -112.0062026977539, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5000749826431274, + "rewards/margins": 22.833240509033203, + "rewards/rejected": -22.333166122436523, + "step": 3120 + }, + { + "epoch": 1.43, + "learning_rate": 1.7462383770076077e-07, + 
"logits/chosen": -1.2367146015167236, + "logits/rejected": -1.178846001625061, + "logps/chosen": -88.32302856445312, + "logps/rejected": -115.77542877197266, + "loss": 0.0076, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.37687280774116516, + "rewards/margins": 24.140419006347656, + "rewards/rejected": -23.763545989990234, + "step": 3130 + }, + { + "epoch": 1.43, + "learning_rate": 1.7411665257819102e-07, + "logits/chosen": -1.2372697591781616, + "logits/rejected": -1.1662713289260864, + "logps/chosen": -96.11444091796875, + "logps/rejected": -114.61024475097656, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8727655410766602, + "rewards/margins": 24.095346450805664, + "rewards/rejected": -23.222579956054688, + "step": 3140 + }, + { + "epoch": 1.44, + "learning_rate": 1.736094674556213e-07, + "logits/chosen": -1.2260363101959229, + "logits/rejected": -1.169950246810913, + "logps/chosen": -87.82176208496094, + "logps/rejected": -114.6717529296875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6339221000671387, + "rewards/margins": 23.64284324645996, + "rewards/rejected": -23.00891876220703, + "step": 3150 + }, + { + "epoch": 1.44, + "learning_rate": 1.7310228233305156e-07, + "logits/chosen": -1.2591478824615479, + "logits/rejected": -1.2040024995803833, + "logps/chosen": -89.02254486083984, + "logps/rejected": -113.30888366699219, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9198814630508423, + "rewards/margins": 23.625865936279297, + "rewards/rejected": -22.705984115600586, + "step": 3160 + }, + { + "epoch": 1.45, + "learning_rate": 1.7259509721048184e-07, + "logits/chosen": -1.2532278299331665, + "logits/rejected": -1.2039217948913574, + "logps/chosen": -82.79835510253906, + "logps/rejected": -113.0461654663086, + "loss": 0.0045, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6827878952026367, + "rewards/margins": 23.46468734741211, + "rewards/rejected": 
-22.781898498535156, + "step": 3170 + }, + { + "epoch": 1.45, + "learning_rate": 1.7208791208791206e-07, + "logits/chosen": -1.272351622581482, + "logits/rejected": -1.2235658168792725, + "logps/chosen": -83.52709197998047, + "logps/rejected": -115.59749603271484, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9059032201766968, + "rewards/margins": 24.44829559326172, + "rewards/rejected": -23.54239273071289, + "step": 3180 + }, + { + "epoch": 1.46, + "learning_rate": 1.7158072696534235e-07, + "logits/chosen": -1.2341419458389282, + "logits/rejected": -1.178829550743103, + "logps/chosen": -90.00852966308594, + "logps/rejected": -108.0736312866211, + "loss": 0.0024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2999484539031982, + "rewards/margins": 22.940885543823242, + "rewards/rejected": -21.64093589782715, + "step": 3190 + }, + { + "epoch": 1.46, + "learning_rate": 1.710735418427726e-07, + "logits/chosen": -1.2229827642440796, + "logits/rejected": -1.1729035377502441, + "logps/chosen": -88.7994384765625, + "logps/rejected": -115.1880111694336, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18852441012859344, + "rewards/margins": 22.59259033203125, + "rewards/rejected": -22.781112670898438, + "step": 3200 + }, + { + "epoch": 1.46, + "eval_logits/chosen": -1.3309955596923828, + "eval_logits/rejected": -1.2624685764312744, + "eval_logps/chosen": -87.6993408203125, + "eval_logps/rejected": -111.36738586425781, + "eval_loss": 0.0076772235333919525, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.866541862487793, + "eval_rewards/margins": 23.879375457763672, + "eval_rewards/rejected": -23.012832641601562, + "eval_runtime": 71.9768, + "eval_samples_per_second": 39.763, + "eval_steps_per_second": 2.487, + "step": 3200 + }, + { + "epoch": 1.47, + "learning_rate": 1.7056635672020288e-07, + "logits/chosen": -1.2811453342437744, + "logits/rejected": -1.2151391506195068, + 
"logps/chosen": -92.49798583984375, + "logps/rejected": -116.91839599609375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1861944198608398, + "rewards/margins": 25.033100128173828, + "rewards/rejected": -23.846908569335938, + "step": 3210 + }, + { + "epoch": 1.47, + "learning_rate": 1.7005917159763313e-07, + "logits/chosen": -1.2741590738296509, + "logits/rejected": -1.2121143341064453, + "logps/chosen": -88.5379409790039, + "logps/rejected": -116.83935546875, + "loss": 0.0045, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4937240481376648, + "rewards/margins": 23.87834930419922, + "rewards/rejected": -23.384624481201172, + "step": 3220 + }, + { + "epoch": 1.47, + "learning_rate": 1.695519864750634e-07, + "logits/chosen": -1.2720822095870972, + "logits/rejected": -1.2110103368759155, + "logps/chosen": -90.10042572021484, + "logps/rejected": -108.63752746582031, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.890106201171875, + "rewards/margins": 22.506465911865234, + "rewards/rejected": -21.61635971069336, + "step": 3230 + }, + { + "epoch": 1.48, + "learning_rate": 1.6904480135249364e-07, + "logits/chosen": -1.2414791584014893, + "logits/rejected": -1.1807258129119873, + "logps/chosen": -90.51213073730469, + "logps/rejected": -115.80250549316406, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1200735569000244, + "rewards/margins": 25.453067779541016, + "rewards/rejected": -24.332996368408203, + "step": 3240 + }, + { + "epoch": 1.48, + "learning_rate": 1.6853761622992392e-07, + "logits/chosen": -1.2474098205566406, + "logits/rejected": -1.203169584274292, + "logps/chosen": -85.99417114257812, + "logps/rejected": -115.08979797363281, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23325538635253906, + "rewards/margins": 23.269723892211914, + "rewards/rejected": -23.036466598510742, + "step": 3250 + }, + { + "epoch": 1.49, + "learning_rate": 
1.6803043110735418e-07, + "logits/chosen": -1.2636672258377075, + "logits/rejected": -1.197353482246399, + "logps/chosen": -84.11290740966797, + "logps/rejected": -111.87858581542969, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2561546564102173, + "rewards/margins": 24.462989807128906, + "rewards/rejected": -23.206836700439453, + "step": 3260 + }, + { + "epoch": 1.49, + "learning_rate": 1.6752324598478443e-07, + "logits/chosen": -1.2593055963516235, + "logits/rejected": -1.191775918006897, + "logps/chosen": -86.7191390991211, + "logps/rejected": -121.62376403808594, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4610673785209656, + "rewards/margins": 25.324344635009766, + "rewards/rejected": -25.785411834716797, + "step": 3270 + }, + { + "epoch": 1.5, + "learning_rate": 1.6701606086221469e-07, + "logits/chosen": -1.2484862804412842, + "logits/rejected": -1.1960498094558716, + "logps/chosen": -91.21693420410156, + "logps/rejected": -115.73272705078125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2348252534866333, + "rewards/margins": 23.737483978271484, + "rewards/rejected": -23.502655029296875, + "step": 3280 + }, + { + "epoch": 1.5, + "learning_rate": 1.6650887573964497e-07, + "logits/chosen": -1.247483253479004, + "logits/rejected": -1.1900913715362549, + "logps/chosen": -89.03797149658203, + "logps/rejected": -114.04032135009766, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4475841522216797, + "rewards/margins": 24.663578033447266, + "rewards/rejected": -23.215991973876953, + "step": 3290 + }, + { + "epoch": 1.51, + "learning_rate": 1.6600169061707522e-07, + "logits/chosen": -1.3016645908355713, + "logits/rejected": -1.2365801334381104, + "logps/chosen": -94.5399398803711, + "logps/rejected": -124.09773254394531, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6547442078590393, + "rewards/margins": 25.51272201538086, + "rewards/rejected": 
-24.857975006103516, + "step": 3300 + }, + { + "epoch": 1.51, + "eval_logits/chosen": -1.3517431020736694, + "eval_logits/rejected": -1.2811920642852783, + "eval_logps/chosen": -87.5091781616211, + "eval_logps/rejected": -111.33209991455078, + "eval_loss": 0.007596523035317659, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.9616219401359558, + "eval_rewards/margins": 23.956809997558594, + "eval_rewards/rejected": -22.995187759399414, + "eval_runtime": 59.4047, + "eval_samples_per_second": 48.178, + "eval_steps_per_second": 3.013, + "step": 3300 + }, + { + "epoch": 1.51, + "learning_rate": 1.654945054945055e-07, + "logits/chosen": -1.2787243127822876, + "logits/rejected": -1.2220463752746582, + "logps/chosen": -85.47830963134766, + "logps/rejected": -120.3558349609375, + "loss": 0.0012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.1819784939289093, + "rewards/margins": 24.017372131347656, + "rewards/rejected": -23.835391998291016, + "step": 3310 + }, + { + "epoch": 1.52, + "learning_rate": 1.6498732037193573e-07, + "logits/chosen": -1.2893264293670654, + "logits/rejected": -1.2257001399993896, + "logps/chosen": -85.7628402709961, + "logps/rejected": -111.8202896118164, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0795629024505615, + "rewards/margins": 24.557388305664062, + "rewards/rejected": -23.477825164794922, + "step": 3320 + }, + { + "epoch": 1.52, + "learning_rate": 1.64480135249366e-07, + "logits/chosen": -1.2428733110427856, + "logits/rejected": -1.1967283487319946, + "logps/chosen": -90.3432846069336, + "logps/rejected": -121.13531494140625, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1183116436004639, + "rewards/margins": 25.94301414489746, + "rewards/rejected": -24.824703216552734, + "step": 3330 + }, + { + "epoch": 1.52, + "learning_rate": 1.6397295012679627e-07, + "logits/chosen": -1.2334848642349243, + "logits/rejected": -1.1695020198822021, + 
"logps/chosen": -89.45833587646484, + "logps/rejected": -115.14144134521484, + "loss": 0.007, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8301811218261719, + "rewards/margins": 23.684751510620117, + "rewards/rejected": -22.854568481445312, + "step": 3340 + }, + { + "epoch": 1.53, + "learning_rate": 1.6346576500422655e-07, + "logits/chosen": -1.261792778968811, + "logits/rejected": -1.2009265422821045, + "logps/chosen": -91.06483459472656, + "logps/rejected": -116.6786880493164, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8044708371162415, + "rewards/margins": 24.910701751708984, + "rewards/rejected": -24.10622787475586, + "step": 3350 + }, + { + "epoch": 1.53, + "learning_rate": 1.629585798816568e-07, + "logits/chosen": -1.2624332904815674, + "logits/rejected": -1.2024790048599243, + "logps/chosen": -92.38378143310547, + "logps/rejected": -116.35567474365234, + "loss": 0.0045, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.27976271510124207, + "rewards/margins": 23.846553802490234, + "rewards/rejected": -24.12631607055664, + "step": 3360 + }, + { + "epoch": 1.54, + "learning_rate": 1.6245139475908705e-07, + "logits/chosen": -1.2883548736572266, + "logits/rejected": -1.2222377061843872, + "logps/chosen": -88.30838775634766, + "logps/rejected": -112.32978820800781, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10080569982528687, + "rewards/margins": 22.96854019165039, + "rewards/rejected": -22.867733001708984, + "step": 3370 + }, + { + "epoch": 1.54, + "learning_rate": 1.619442096365173e-07, + "logits/chosen": -1.3186638355255127, + "logits/rejected": -1.245281457901001, + "logps/chosen": -87.22611999511719, + "logps/rejected": -113.5263671875, + "loss": 0.0011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1867640018463135, + "rewards/margins": 25.683629989624023, + "rewards/rejected": -24.49686622619629, + "step": 3380 + }, + { + "epoch": 1.55, + 
"learning_rate": 1.614370245139476e-07, + "logits/chosen": -1.2895255088806152, + "logits/rejected": -1.229871392250061, + "logps/chosen": -85.41172790527344, + "logps/rejected": -117.4696273803711, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6948531866073608, + "rewards/margins": 25.749099731445312, + "rewards/rejected": -25.054248809814453, + "step": 3390 + }, + { + "epoch": 1.55, + "learning_rate": 1.6092983939137784e-07, + "logits/chosen": -1.2895100116729736, + "logits/rejected": -1.2215838432312012, + "logps/chosen": -92.64399719238281, + "logps/rejected": -113.7071304321289, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.008798837661743, + "rewards/margins": 25.56492805480957, + "rewards/rejected": -23.556129455566406, + "step": 3400 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -1.352734923362732, + "eval_logits/rejected": -1.2827072143554688, + "eval_logps/chosen": -87.6773681640625, + "eval_logps/rejected": -113.02869415283203, + "eval_loss": 0.007990003563463688, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.8775248527526855, + "eval_rewards/margins": 24.72101593017578, + "eval_rewards/rejected": -23.843490600585938, + "eval_runtime": 59.8847, + "eval_samples_per_second": 47.792, + "eval_steps_per_second": 2.989, + "step": 3400 + }, + { + "epoch": 1.56, + "learning_rate": 1.6042265426880812e-07, + "logits/chosen": -1.2344064712524414, + "logits/rejected": -1.1821552515029907, + "logps/chosen": -87.24339294433594, + "logps/rejected": -113.21418762207031, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5112111568450928, + "rewards/margins": 24.323862075805664, + "rewards/rejected": -22.81264877319336, + "step": 3410 + }, + { + "epoch": 1.56, + "learning_rate": 1.5991546914623835e-07, + "logits/chosen": -1.2430731058120728, + "logits/rejected": -1.1846189498901367, + "logps/chosen": -86.92017364501953, + "logps/rejected": -111.88777160644531, + "loss": 
0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5619323253631592, + "rewards/margins": 25.513439178466797, + "rewards/rejected": -23.951505661010742, + "step": 3420 + }, + { + "epoch": 1.57, + "learning_rate": 1.5940828402366863e-07, + "logits/chosen": -1.2778552770614624, + "logits/rejected": -1.2257239818572998, + "logps/chosen": -89.74571228027344, + "logps/rejected": -122.40673828125, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9483524560928345, + "rewards/margins": 26.498586654663086, + "rewards/rejected": -25.550235748291016, + "step": 3430 + }, + { + "epoch": 1.57, + "learning_rate": 1.589010989010989e-07, + "logits/chosen": -1.2671973705291748, + "logits/rejected": -1.1991872787475586, + "logps/chosen": -95.77862548828125, + "logps/rejected": -120.44307708740234, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5015150308609009, + "rewards/margins": 25.031169891357422, + "rewards/rejected": -23.529653549194336, + "step": 3440 + }, + { + "epoch": 1.57, + "learning_rate": 1.5839391377852917e-07, + "logits/chosen": -1.283717393875122, + "logits/rejected": -1.2063400745391846, + "logps/chosen": -93.32177734375, + "logps/rejected": -118.2795639038086, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5961825847625732, + "rewards/margins": 25.887725830078125, + "rewards/rejected": -24.29154396057129, + "step": 3450 + }, + { + "epoch": 1.58, + "learning_rate": 1.5788672865595942e-07, + "logits/chosen": -1.2777965068817139, + "logits/rejected": -1.2139501571655273, + "logps/chosen": -86.36488342285156, + "logps/rejected": -112.5156021118164, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4618368148803711, + "rewards/margins": 24.505504608154297, + "rewards/rejected": -24.043668746948242, + "step": 3460 + }, + { + "epoch": 1.58, + "learning_rate": 1.5737954353338968e-07, + "logits/chosen": -1.2852230072021484, + "logits/rejected": -1.2266560792922974, + "logps/chosen": 
-85.028076171875, + "logps/rejected": -111.90921783447266, + "loss": 0.0054, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.6333560943603516, + "rewards/margins": 25.04153060913086, + "rewards/rejected": -23.40817642211914, + "step": 3470 + }, + { + "epoch": 1.59, + "learning_rate": 1.5687235841081993e-07, + "logits/chosen": -1.2799792289733887, + "logits/rejected": -1.216094970703125, + "logps/chosen": -88.36021423339844, + "logps/rejected": -112.99951171875, + "loss": 0.0067, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2346739768981934, + "rewards/margins": 25.258432388305664, + "rewards/rejected": -24.023757934570312, + "step": 3480 + }, + { + "epoch": 1.59, + "learning_rate": 1.563651732882502e-07, + "logits/chosen": -1.2925515174865723, + "logits/rejected": -1.2144296169281006, + "logps/chosen": -93.78471374511719, + "logps/rejected": -115.75556945800781, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4354177713394165, + "rewards/margins": 26.061655044555664, + "rewards/rejected": -24.626237869262695, + "step": 3490 + }, + { + "epoch": 1.6, + "learning_rate": 1.5585798816568047e-07, + "logits/chosen": -1.2529594898223877, + "logits/rejected": -1.2021251916885376, + "logps/chosen": -85.00303649902344, + "logps/rejected": -112.36210632324219, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6849428415298462, + "rewards/margins": 23.76913070678711, + "rewards/rejected": -22.08418846130371, + "step": 3500 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.347659945487976, + "eval_logits/rejected": -1.2767060995101929, + "eval_logps/chosen": -86.58101654052734, + "eval_logps/rejected": -111.87799835205078, + "eval_loss": 0.007571995258331299, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.4257049560546875, + "eval_rewards/margins": 24.693849563598633, + "eval_rewards/rejected": -23.268142700195312, + "eval_runtime": 61.6155, + "eval_samples_per_second": 
46.449, + "eval_steps_per_second": 2.905, + "step": 3500 + }, + { + "epoch": 1.6, + "learning_rate": 1.5535080304311072e-07, + "logits/chosen": -1.3136241436004639, + "logits/rejected": -1.2429287433624268, + "logps/chosen": -89.40177917480469, + "logps/rejected": -115.84840393066406, + "loss": 0.0039, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.674997091293335, + "rewards/margins": 24.841594696044922, + "rewards/rejected": -23.166595458984375, + "step": 3510 + }, + { + "epoch": 1.61, + "learning_rate": 1.5484361792054097e-07, + "logits/chosen": -1.2944393157958984, + "logits/rejected": -1.2253139019012451, + "logps/chosen": -89.92081451416016, + "logps/rejected": -117.0937271118164, + "loss": 0.0036, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8267130851745605, + "rewards/margins": 26.268218994140625, + "rewards/rejected": -25.441509246826172, + "step": 3520 + }, + { + "epoch": 1.61, + "learning_rate": 1.5433643279797126e-07, + "logits/chosen": -1.2919762134552002, + "logits/rejected": -1.2337154150009155, + "logps/chosen": -83.75240325927734, + "logps/rejected": -113.67716979980469, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2121895551681519, + "rewards/margins": 25.531957626342773, + "rewards/rejected": -24.319766998291016, + "step": 3530 + }, + { + "epoch": 1.62, + "learning_rate": 1.538292476754015e-07, + "logits/chosen": -1.2944412231445312, + "logits/rejected": -1.2345895767211914, + "logps/chosen": -87.15584564208984, + "logps/rejected": -115.55804443359375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4887280464172363, + "rewards/margins": 26.8122501373291, + "rewards/rejected": -24.323522567749023, + "step": 3540 + }, + { + "epoch": 1.62, + "learning_rate": 1.533220625528318e-07, + "logits/chosen": -1.2804213762283325, + "logits/rejected": -1.2162938117980957, + "logps/chosen": -89.55908203125, + "logps/rejected": -120.44891357421875, + "loss": 0.0012, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 1.0079282522201538, + "rewards/margins": 26.581329345703125, + "rewards/rejected": -25.573402404785156, + "step": 3550 + }, + { + "epoch": 1.62, + "learning_rate": 1.5281487743026202e-07, + "logits/chosen": -1.3119621276855469, + "logits/rejected": -1.227165937423706, + "logps/chosen": -96.74497985839844, + "logps/rejected": -121.7210464477539, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5257515907287598, + "rewards/margins": 27.811294555664062, + "rewards/rejected": -26.28554344177246, + "step": 3560 + }, + { + "epoch": 1.63, + "learning_rate": 1.523076923076923e-07, + "logits/chosen": -1.279449224472046, + "logits/rejected": -1.22541081905365, + "logps/chosen": -84.21186828613281, + "logps/rejected": -113.58978271484375, + "loss": 0.0034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9092862010002136, + "rewards/margins": 25.38943099975586, + "rewards/rejected": -24.480144500732422, + "step": 3570 + }, + { + "epoch": 1.63, + "learning_rate": 1.5180050718512255e-07, + "logits/chosen": -1.2649726867675781, + "logits/rejected": -1.1972577571868896, + "logps/chosen": -91.45044708251953, + "logps/rejected": -119.5804443359375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8337907791137695, + "rewards/margins": 28.143224716186523, + "rewards/rejected": -25.309436798095703, + "step": 3580 + }, + { + "epoch": 1.64, + "learning_rate": 1.5129332206255283e-07, + "logits/chosen": -1.2978521585464478, + "logits/rejected": -1.252637505531311, + "logps/chosen": -90.22964477539062, + "logps/rejected": -125.02496337890625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8120530843734741, + "rewards/margins": 26.514413833618164, + "rewards/rejected": -24.702360153198242, + "step": 3590 + }, + { + "epoch": 1.64, + "learning_rate": 1.507861369399831e-07, + "logits/chosen": -1.2789386510849, + "logits/rejected": -1.2158098220825195, + "logps/chosen": 
-90.51055908203125, + "logps/rejected": -118.33430480957031, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5107016563415527, + "rewards/margins": 28.50537109375, + "rewards/rejected": -25.994670867919922, + "step": 3600 + }, + { + "epoch": 1.64, + "eval_logits/chosen": -1.3777239322662354, + "eval_logits/rejected": -1.3042701482772827, + "eval_logps/chosen": -87.28597259521484, + "eval_logps/rejected": -114.07413482666016, + "eval_loss": 0.007842887192964554, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.0732280015945435, + "eval_rewards/margins": 25.439434051513672, + "eval_rewards/rejected": -24.3662052154541, + "eval_runtime": 63.3589, + "eval_samples_per_second": 45.171, + "eval_steps_per_second": 2.825, + "step": 3600 + }, + { + "epoch": 1.65, + "learning_rate": 1.5027895181741334e-07, + "logits/chosen": -1.370045781135559, + "logits/rejected": -1.283416748046875, + "logps/chosen": -90.81478118896484, + "logps/rejected": -112.16239929199219, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0164520740509033, + "rewards/margins": 23.93118667602539, + "rewards/rejected": -22.91473388671875, + "step": 3610 + }, + { + "epoch": 1.65, + "learning_rate": 1.497717666948436e-07, + "logits/chosen": -1.2713721990585327, + "logits/rejected": -1.216667890548706, + "logps/chosen": -84.43434143066406, + "logps/rejected": -113.650146484375, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3617621064186096, + "rewards/margins": 24.290042877197266, + "rewards/rejected": -23.92828369140625, + "step": 3620 + }, + { + "epoch": 1.66, + "learning_rate": 1.4926458157227388e-07, + "logits/chosen": -1.316611647605896, + "logits/rejected": -1.2415025234222412, + "logps/chosen": -88.00056457519531, + "logps/rejected": -118.13746643066406, + "loss": 0.0058, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7822587490081787, + "rewards/margins": 27.109432220458984, + 
"rewards/rejected": -25.327173233032227, + "step": 3630 + }, + { + "epoch": 1.66, + "learning_rate": 1.4875739644970413e-07, + "logits/chosen": -1.2669055461883545, + "logits/rejected": -1.198326587677002, + "logps/chosen": -92.14048767089844, + "logps/rejected": -115.46183776855469, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0600082874298096, + "rewards/margins": 25.716211318969727, + "rewards/rejected": -24.656200408935547, + "step": 3640 + }, + { + "epoch": 1.67, + "learning_rate": 1.4825021132713439e-07, + "logits/chosen": -1.2855212688446045, + "logits/rejected": -1.2362643480300903, + "logps/chosen": -85.3121337890625, + "logps/rejected": -116.80509948730469, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1400699615478516, + "rewards/margins": 25.843036651611328, + "rewards/rejected": -23.70296859741211, + "step": 3650 + }, + { + "epoch": 1.67, + "learning_rate": 1.4774302620456467e-07, + "logits/chosen": -1.2816253900527954, + "logits/rejected": -1.2244489192962646, + "logps/chosen": -86.08045959472656, + "logps/rejected": -120.78515625, + "loss": 0.0012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7160287499427795, + "rewards/margins": 25.913681030273438, + "rewards/rejected": -25.197650909423828, + "step": 3660 + }, + { + "epoch": 1.68, + "learning_rate": 1.4723584108199492e-07, + "logits/chosen": -1.3148205280303955, + "logits/rejected": -1.2522845268249512, + "logps/chosen": -88.29036712646484, + "logps/rejected": -117.13260650634766, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6190974712371826, + "rewards/margins": 26.740299224853516, + "rewards/rejected": -25.121198654174805, + "step": 3670 + }, + { + "epoch": 1.68, + "learning_rate": 1.4672865595942518e-07, + "logits/chosen": -1.3006482124328613, + "logits/rejected": -1.2300279140472412, + "logps/chosen": -87.94609069824219, + "logps/rejected": -120.0467300415039, + "loss": 0.0049, + "rewards/accuracies": 
1.0, + "rewards/chosen": 2.505680799484253, + "rewards/margins": 26.633773803710938, + "rewards/rejected": -24.128095626831055, + "step": 3680 + }, + { + "epoch": 1.68, + "learning_rate": 1.4622147083685546e-07, + "logits/chosen": -1.3202269077301025, + "logits/rejected": -1.252046823501587, + "logps/chosen": -95.1660385131836, + "logps/rejected": -121.23722839355469, + "loss": 0.0057, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.524958848953247, + "rewards/margins": 27.411617279052734, + "rewards/rejected": -25.886661529541016, + "step": 3690 + }, + { + "epoch": 1.69, + "learning_rate": 1.457142857142857e-07, + "logits/chosen": -1.2965933084487915, + "logits/rejected": -1.2345640659332275, + "logps/chosen": -83.83021545410156, + "logps/rejected": -117.0789794921875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2809300422668457, + "rewards/margins": 26.77134132385254, + "rewards/rejected": -24.49040985107422, + "step": 3700 + }, + { + "epoch": 1.69, + "eval_logits/chosen": -1.3869707584381104, + "eval_logits/rejected": -1.3094760179519653, + "eval_logps/chosen": -86.7977066040039, + "eval_logps/rejected": -115.47274017333984, + "eval_loss": 0.008259255439043045, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.3173613548278809, + "eval_rewards/margins": 26.382875442504883, + "eval_rewards/rejected": -25.065513610839844, + "eval_runtime": 63.3424, + "eval_samples_per_second": 45.183, + "eval_steps_per_second": 2.826, + "step": 3700 + }, + { + "epoch": 1.69, + "learning_rate": 1.4520710059171596e-07, + "logits/chosen": -1.2985725402832031, + "logits/rejected": -1.2297552824020386, + "logps/chosen": -90.36856079101562, + "logps/rejected": -116.03338623046875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.916562557220459, + "rewards/margins": 27.512588500976562, + "rewards/rejected": -24.596027374267578, + "step": 3710 + }, + { + "epoch": 1.7, + "learning_rate": 
1.4469991546914622e-07, + "logits/chosen": -1.3176157474517822, + "logits/rejected": -1.247491478919983, + "logps/chosen": -85.81680297851562, + "logps/rejected": -112.21038818359375, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.835550308227539, + "rewards/margins": 26.416223526000977, + "rewards/rejected": -24.580673217773438, + "step": 3720 + }, + { + "epoch": 1.7, + "learning_rate": 1.441927303465765e-07, + "logits/chosen": -1.3307037353515625, + "logits/rejected": -1.2499626874923706, + "logps/chosen": -90.99395751953125, + "logps/rejected": -117.75846099853516, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8873962163925171, + "rewards/margins": 25.621952056884766, + "rewards/rejected": -24.734556198120117, + "step": 3730 + }, + { + "epoch": 1.71, + "learning_rate": 1.4368554522400675e-07, + "logits/chosen": -1.2795710563659668, + "logits/rejected": -1.2092183828353882, + "logps/chosen": -90.55952453613281, + "logps/rejected": -121.86543273925781, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7004776000976562, + "rewards/margins": 29.03326988220215, + "rewards/rejected": -26.332794189453125, + "step": 3740 + }, + { + "epoch": 1.71, + "learning_rate": 1.43178360101437e-07, + "logits/chosen": -1.3224431276321411, + "logits/rejected": -1.2512967586517334, + "logps/chosen": -84.958984375, + "logps/rejected": -113.85465240478516, + "loss": 0.0016, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.3535866737365723, + "rewards/margins": 26.663818359375, + "rewards/rejected": -24.310230255126953, + "step": 3750 + }, + { + "epoch": 1.72, + "learning_rate": 1.426711749788673e-07, + "logits/chosen": -1.3394898176193237, + "logits/rejected": -1.2797186374664307, + "logps/chosen": -83.2117919921875, + "logps/rejected": -118.57608795166016, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2878071069717407, + "rewards/margins": 26.771793365478516, + "rewards/rejected": 
-25.483983993530273, + "step": 3760 + }, + { + "epoch": 1.72, + "learning_rate": 1.4216398985629754e-07, + "logits/chosen": -1.2791587114334106, + "logits/rejected": -1.2220796346664429, + "logps/chosen": -87.02328491210938, + "logps/rejected": -123.32960510253906, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.185002088546753, + "rewards/margins": 27.750438690185547, + "rewards/rejected": -25.565439224243164, + "step": 3770 + }, + { + "epoch": 1.73, + "learning_rate": 1.416568047337278e-07, + "logits/chosen": -1.3284387588500977, + "logits/rejected": -1.2588412761688232, + "logps/chosen": -88.56816101074219, + "logps/rejected": -118.0064468383789, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4277937412261963, + "rewards/margins": 26.984973907470703, + "rewards/rejected": -25.557178497314453, + "step": 3780 + }, + { + "epoch": 1.73, + "learning_rate": 1.4114961961115805e-07, + "logits/chosen": -1.3133352994918823, + "logits/rejected": -1.248510718345642, + "logps/chosen": -92.50769805908203, + "logps/rejected": -114.99954986572266, + "loss": 0.0046, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.1458348035812378, + "rewards/margins": 24.03805160522461, + "rewards/rejected": -22.89221954345703, + "step": 3790 + }, + { + "epoch": 1.73, + "learning_rate": 1.4064243448858833e-07, + "logits/chosen": -1.302869200706482, + "logits/rejected": -1.2390058040618896, + "logps/chosen": -82.003662109375, + "logps/rejected": -112.65030670166016, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.544621706008911, + "rewards/margins": 26.4320068359375, + "rewards/rejected": -23.887386322021484, + "step": 3800 + }, + { + "epoch": 1.73, + "eval_logits/chosen": -1.3750476837158203, + "eval_logits/rejected": -1.29920494556427, + "eval_logps/chosen": -85.6637191772461, + "eval_logps/rejected": -111.03407287597656, + "eval_loss": 0.007216113153845072, + "eval_rewards/accuracies": 0.9916201233863831, + 
"eval_rewards/chosen": 1.884352207183838, + "eval_rewards/margins": 24.73053741455078, + "eval_rewards/rejected": -22.84618377685547, + "eval_runtime": 62.1547, + "eval_samples_per_second": 46.046, + "eval_steps_per_second": 2.88, + "step": 3800 + }, + { + "epoch": 1.74, + "learning_rate": 1.401352493660186e-07, + "logits/chosen": -1.2911725044250488, + "logits/rejected": -1.2250169515609741, + "logps/chosen": -84.48226928710938, + "logps/rejected": -108.87812805175781, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1833953857421875, + "rewards/margins": 23.128978729248047, + "rewards/rejected": -21.94558334350586, + "step": 3810 + }, + { + "epoch": 1.74, + "learning_rate": 1.3962806424344884e-07, + "logits/chosen": -1.320176362991333, + "logits/rejected": -1.2417948246002197, + "logps/chosen": -87.6998519897461, + "logps/rejected": -110.061279296875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8369624614715576, + "rewards/margins": 24.469478607177734, + "rewards/rejected": -22.63251495361328, + "step": 3820 + }, + { + "epoch": 1.75, + "learning_rate": 1.3912087912087912e-07, + "logits/chosen": -1.3057355880737305, + "logits/rejected": -1.23427414894104, + "logps/chosen": -84.95487976074219, + "logps/rejected": -113.34101867675781, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.752861976623535, + "rewards/margins": 27.136266708374023, + "rewards/rejected": -24.383405685424805, + "step": 3830 + }, + { + "epoch": 1.75, + "learning_rate": 1.3861369399830938e-07, + "logits/chosen": -1.3030786514282227, + "logits/rejected": -1.236664056777954, + "logps/chosen": -83.6253890991211, + "logps/rejected": -113.64739990234375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3076999187469482, + "rewards/margins": 25.728729248046875, + "rewards/rejected": -24.421030044555664, + "step": 3840 + }, + { + "epoch": 1.76, + "learning_rate": 1.3810650887573963e-07, + "logits/chosen": 
-1.30088210105896, + "logits/rejected": -1.2418001890182495, + "logps/chosen": -85.37004852294922, + "logps/rejected": -113.1008071899414, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.105379104614258, + "rewards/margins": 25.393503189086914, + "rewards/rejected": -23.288122177124023, + "step": 3850 + }, + { + "epoch": 1.76, + "learning_rate": 1.3759932375316989e-07, + "logits/chosen": -1.2641175985336304, + "logits/rejected": -1.20892333984375, + "logps/chosen": -87.78987121582031, + "logps/rejected": -111.98150634765625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.467369556427002, + "rewards/margins": 25.168516159057617, + "rewards/rejected": -22.70115089416504, + "step": 3860 + }, + { + "epoch": 1.77, + "learning_rate": 1.3709213863060017e-07, + "logits/chosen": -1.328049898147583, + "logits/rejected": -1.2603827714920044, + "logps/chosen": -81.99150085449219, + "logps/rejected": -113.47479248046875, + "loss": 0.0012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4597467184066772, + "rewards/margins": 24.159908294677734, + "rewards/rejected": -22.70016098022461, + "step": 3870 + }, + { + "epoch": 1.77, + "learning_rate": 1.3658495350803042e-07, + "logits/chosen": -1.2881757020950317, + "logits/rejected": -1.219227910041809, + "logps/chosen": -89.5770034790039, + "logps/rejected": -117.85916900634766, + "loss": 0.0024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6704366207122803, + "rewards/margins": 26.334217071533203, + "rewards/rejected": -24.663782119750977, + "step": 3880 + }, + { + "epoch": 1.78, + "learning_rate": 1.3607776838546067e-07, + "logits/chosen": -1.2890180349349976, + "logits/rejected": -1.2281222343444824, + "logps/chosen": -87.47191619873047, + "logps/rejected": -117.46723937988281, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8343310356140137, + "rewards/margins": 27.57296371459961, + "rewards/rejected": -24.738632202148438, + 
"step": 3890 + }, + { + "epoch": 1.78, + "learning_rate": 1.3557058326289096e-07, + "logits/chosen": -1.2887780666351318, + "logits/rejected": -1.2126020193099976, + "logps/chosen": -92.53477478027344, + "logps/rejected": -118.53013610839844, + "loss": 0.004, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5731879472732544, + "rewards/margins": 27.09688949584961, + "rewards/rejected": -25.52370262145996, + "step": 3900 + }, + { + "epoch": 1.78, + "eval_logits/chosen": -1.3683326244354248, + "eval_logits/rejected": -1.2940618991851807, + "eval_logps/chosen": -86.10755920410156, + "eval_logps/rejected": -113.04136657714844, + "eval_loss": 0.00731794023886323, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.6624343395233154, + "eval_rewards/margins": 25.512258529663086, + "eval_rewards/rejected": -23.849822998046875, + "eval_runtime": 60.9365, + "eval_samples_per_second": 46.967, + "eval_steps_per_second": 2.937, + "step": 3900 + }, + { + "epoch": 1.78, + "learning_rate": 1.350633981403212e-07, + "logits/chosen": -1.2927218675613403, + "logits/rejected": -1.2228248119354248, + "logps/chosen": -83.60839080810547, + "logps/rejected": -112.75862121582031, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.901503324508667, + "rewards/margins": 24.796140670776367, + "rewards/rejected": -22.894638061523438, + "step": 3910 + }, + { + "epoch": 1.79, + "learning_rate": 1.3455621301775146e-07, + "logits/chosen": -1.2360032796859741, + "logits/rejected": -1.1930327415466309, + "logps/chosen": -85.62734985351562, + "logps/rejected": -117.69222259521484, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.328904390335083, + "rewards/margins": 26.03916358947754, + "rewards/rejected": -24.710262298583984, + "step": 3920 + }, + { + "epoch": 1.79, + "learning_rate": 1.3404902789518174e-07, + "logits/chosen": -1.299557089805603, + "logits/rejected": -1.2226934432983398, + "logps/chosen": -95.00071716308594, 
+ "logps/rejected": -120.6982650756836, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4814507961273193, + "rewards/margins": 25.5311222076416, + "rewards/rejected": -24.049671173095703, + "step": 3930 + }, + { + "epoch": 1.8, + "learning_rate": 1.33541842772612e-07, + "logits/chosen": -1.2686429023742676, + "logits/rejected": -1.2006398439407349, + "logps/chosen": -87.60845184326172, + "logps/rejected": -113.25736999511719, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.330335021018982, + "rewards/margins": 24.519290924072266, + "rewards/rejected": -23.188955307006836, + "step": 3940 + }, + { + "epoch": 1.8, + "learning_rate": 1.3303465765004225e-07, + "logits/chosen": -1.3029606342315674, + "logits/rejected": -1.2397031784057617, + "logps/chosen": -86.67044067382812, + "logps/rejected": -116.4280776977539, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.413769245147705, + "rewards/margins": 26.642818450927734, + "rewards/rejected": -24.22905158996582, + "step": 3950 + }, + { + "epoch": 1.81, + "learning_rate": 1.325274725274725e-07, + "logits/chosen": -1.288098931312561, + "logits/rejected": -1.22249436378479, + "logps/chosen": -84.44660186767578, + "logps/rejected": -116.7946548461914, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.941779375076294, + "rewards/margins": 26.12166404724121, + "rewards/rejected": -24.17988395690918, + "step": 3960 + }, + { + "epoch": 1.81, + "learning_rate": 1.320202874049028e-07, + "logits/chosen": -1.2721738815307617, + "logits/rejected": -1.2212003469467163, + "logps/chosen": -86.49989318847656, + "logps/rejected": -116.10292053222656, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.964007019996643, + "rewards/margins": 25.622446060180664, + "rewards/rejected": -23.658437728881836, + "step": 3970 + }, + { + "epoch": 1.82, + "learning_rate": 1.3151310228233304e-07, + "logits/chosen": -1.2734956741333008, + "logits/rejected": 
-1.2074804306030273, + "logps/chosen": -87.59144592285156, + "logps/rejected": -116.40534973144531, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6690824031829834, + "rewards/margins": 26.692928314208984, + "rewards/rejected": -25.02384376525879, + "step": 3980 + }, + { + "epoch": 1.82, + "learning_rate": 1.310059171597633e-07, + "logits/chosen": -1.3229997158050537, + "logits/rejected": -1.2673537731170654, + "logps/chosen": -88.17668151855469, + "logps/rejected": -119.3404769897461, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.873644232749939, + "rewards/margins": 26.401302337646484, + "rewards/rejected": -24.527660369873047, + "step": 3990 + }, + { + "epoch": 1.83, + "learning_rate": 1.3049873203719358e-07, + "logits/chosen": -1.3080793619155884, + "logits/rejected": -1.2402544021606445, + "logps/chosen": -90.626953125, + "logps/rejected": -115.0836410522461, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6485347747802734, + "rewards/margins": 25.51757049560547, + "rewards/rejected": -23.86903953552246, + "step": 4000 + }, + { + "epoch": 1.83, + "eval_logits/chosen": -1.388069987297058, + "eval_logits/rejected": -1.311712622642517, + "eval_logps/chosen": -86.8644790649414, + "eval_logps/rejected": -114.29080963134766, + "eval_loss": 0.007556203752756119, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.283972144126892, + "eval_rewards/margins": 25.75852394104004, + "eval_rewards/rejected": -24.474552154541016, + "eval_runtime": 61.4383, + "eval_samples_per_second": 46.583, + "eval_steps_per_second": 2.913, + "step": 4000 + }, + { + "epoch": 1.83, + "learning_rate": 1.2999154691462383e-07, + "logits/chosen": -1.2937663793563843, + "logits/rejected": -1.2333722114562988, + "logps/chosen": -93.04609680175781, + "logps/rejected": -114.9658203125, + "loss": 0.0066, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.066020131111145, + "rewards/margins": 
25.072368621826172, + "rewards/rejected": -24.00634765625, + "step": 4010 + }, + { + "epoch": 1.83, + "learning_rate": 1.2948436179205409e-07, + "logits/chosen": -1.3368138074874878, + "logits/rejected": -1.2809922695159912, + "logps/chosen": -82.21131134033203, + "logps/rejected": -116.16499328613281, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2117726802825928, + "rewards/margins": 25.279598236083984, + "rewards/rejected": -24.067827224731445, + "step": 4020 + }, + { + "epoch": 1.84, + "learning_rate": 1.2897717666948434e-07, + "logits/chosen": -1.275615930557251, + "logits/rejected": -1.2142200469970703, + "logps/chosen": -93.66702270507812, + "logps/rejected": -120.89031982421875, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.48200541734695435, + "rewards/margins": 24.838642120361328, + "rewards/rejected": -24.356637954711914, + "step": 4030 + }, + { + "epoch": 1.84, + "learning_rate": 1.2846999154691462e-07, + "logits/chosen": -1.3146488666534424, + "logits/rejected": -1.2405660152435303, + "logps/chosen": -96.49334716796875, + "logps/rejected": -119.23211669921875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9090890884399414, + "rewards/margins": 27.04913330078125, + "rewards/rejected": -25.140043258666992, + "step": 4040 + }, + { + "epoch": 1.85, + "learning_rate": 1.2796280642434488e-07, + "logits/chosen": -1.307680606842041, + "logits/rejected": -1.2320573329925537, + "logps/chosen": -95.1331787109375, + "logps/rejected": -114.7768325805664, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5242226123809814, + "rewards/margins": 27.13631820678711, + "rewards/rejected": -24.61209487915039, + "step": 4050 + }, + { + "epoch": 1.85, + "learning_rate": 1.2745562130177513e-07, + "logits/chosen": -1.3010895252227783, + "logits/rejected": -1.2413908243179321, + "logps/chosen": -93.52859497070312, + "logps/rejected": -118.62088775634766, + "loss": 0.0031, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 1.5129265785217285, + "rewards/margins": 25.957473754882812, + "rewards/rejected": -24.444549560546875, + "step": 4060 + }, + { + "epoch": 1.86, + "learning_rate": 1.269484361792054e-07, + "logits/chosen": -1.2945194244384766, + "logits/rejected": -1.2368277311325073, + "logps/chosen": -90.33158874511719, + "logps/rejected": -120.31858825683594, + "loss": 0.0023, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.025995206087827682, + "rewards/margins": 24.914783477783203, + "rewards/rejected": -24.940778732299805, + "step": 4070 + }, + { + "epoch": 1.86, + "learning_rate": 1.2644125105663566e-07, + "logits/chosen": -1.298654317855835, + "logits/rejected": -1.2463370561599731, + "logps/chosen": -85.01954650878906, + "logps/rejected": -115.01509094238281, + "loss": 0.0028, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.912619411945343, + "rewards/margins": 25.490772247314453, + "rewards/rejected": -24.57815170288086, + "step": 4080 + }, + { + "epoch": 1.87, + "learning_rate": 1.2593406593406592e-07, + "logits/chosen": -1.282814383506775, + "logits/rejected": -1.2446370124816895, + "logps/chosen": -85.000732421875, + "logps/rejected": -119.21089935302734, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7353527545928955, + "rewards/margins": 26.011398315429688, + "rewards/rejected": -24.276042938232422, + "step": 4090 + }, + { + "epoch": 1.87, + "learning_rate": 1.2542688081149617e-07, + "logits/chosen": -1.2971045970916748, + "logits/rejected": -1.2302577495574951, + "logps/chosen": -90.2288589477539, + "logps/rejected": -119.21665954589844, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2725168466567993, + "rewards/margins": 25.95538902282715, + "rewards/rejected": -24.68286895751953, + "step": 4100 + }, + { + "epoch": 1.87, + "eval_logits/chosen": -1.3849560022354126, + "eval_logits/rejected": -1.30857515335083, + "eval_logps/chosen": 
-86.99442291259766, + "eval_logps/rejected": -114.5916748046875, + "eval_loss": 0.007762798108160496, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": 1.2190001010894775, + "eval_rewards/margins": 25.843978881835938, + "eval_rewards/rejected": -24.624980926513672, + "eval_runtime": 60.6192, + "eval_samples_per_second": 47.213, + "eval_steps_per_second": 2.953, + "step": 4100 + }, + { + "epoch": 1.88, + "learning_rate": 1.2491969568892645e-07, + "logits/chosen": -1.2938053607940674, + "logits/rejected": -1.2201104164123535, + "logps/chosen": -91.01365661621094, + "logps/rejected": -116.42720031738281, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.568861961364746, + "rewards/margins": 26.685710906982422, + "rewards/rejected": -24.116849899291992, + "step": 4110 + }, + { + "epoch": 1.88, + "learning_rate": 1.244125105663567e-07, + "logits/chosen": -1.3405344486236572, + "logits/rejected": -1.2672364711761475, + "logps/chosen": -91.08586120605469, + "logps/rejected": -121.0147705078125, + "loss": 0.0022, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5391464233398438, + "rewards/margins": 27.237112045288086, + "rewards/rejected": -25.69796371459961, + "step": 4120 + }, + { + "epoch": 1.88, + "learning_rate": 1.2390532544378696e-07, + "logits/chosen": -1.2898027896881104, + "logits/rejected": -1.2514622211456299, + "logps/chosen": -82.71153259277344, + "logps/rejected": -122.4542236328125, + "loss": 0.0055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.03917119652032852, + "rewards/margins": 26.051074981689453, + "rewards/rejected": -26.01190185546875, + "step": 4130 + }, + { + "epoch": 1.89, + "learning_rate": 1.2339814032121724e-07, + "logits/chosen": -1.3003642559051514, + "logits/rejected": -1.2193737030029297, + "logps/chosen": -92.31513214111328, + "logps/rejected": -114.4735336303711, + "loss": 0.0013, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 
1.3771212100982666, + "rewards/margins": 25.401681900024414, + "rewards/rejected": -24.02456283569336, + "step": 4140 + }, + { + "epoch": 1.89, + "learning_rate": 1.228909551986475e-07, + "logits/chosen": -1.3193917274475098, + "logits/rejected": -1.2444701194763184, + "logps/chosen": -88.08500671386719, + "logps/rejected": -111.34246826171875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7157142162322998, + "rewards/margins": 25.313156127929688, + "rewards/rejected": -23.59743881225586, + "step": 4150 + }, + { + "epoch": 1.9, + "learning_rate": 1.2238377007607775e-07, + "logits/chosen": -1.2875083684921265, + "logits/rejected": -1.2169303894042969, + "logps/chosen": -90.44905090332031, + "logps/rejected": -113.9192886352539, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2316787242889404, + "rewards/margins": 25.797632217407227, + "rewards/rejected": -24.56595230102539, + "step": 4160 + }, + { + "epoch": 1.9, + "learning_rate": 1.2187658495350803e-07, + "logits/chosen": -1.319364309310913, + "logits/rejected": -1.2405459880828857, + "logps/chosen": -94.96125793457031, + "logps/rejected": -118.59950256347656, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9828643798828125, + "rewards/margins": 25.974903106689453, + "rewards/rejected": -23.99203872680664, + "step": 4170 + }, + { + "epoch": 1.91, + "learning_rate": 1.213693998309383e-07, + "logits/chosen": -1.3195867538452148, + "logits/rejected": -1.2445495128631592, + "logps/chosen": -90.23202514648438, + "logps/rejected": -114.99327087402344, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1389697790145874, + "rewards/margins": 25.39142608642578, + "rewards/rejected": -25.252456665039062, + "step": 4180 + }, + { + "epoch": 1.91, + "learning_rate": 1.2086221470836854e-07, + "logits/chosen": -1.3217099905014038, + "logits/rejected": -1.248329997062683, + "logps/chosen": -89.2808609008789, + "logps/rejected": -123.01396179199219, 
+ "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.657078742980957, + "rewards/margins": 27.067142486572266, + "rewards/rejected": -26.410064697265625, + "step": 4190 + }, + { + "epoch": 1.92, + "learning_rate": 1.203550295857988e-07, + "logits/chosen": -1.324310064315796, + "logits/rejected": -1.257927417755127, + "logps/chosen": -86.30921173095703, + "logps/rejected": -114.84613037109375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0646955966949463, + "rewards/margins": 25.24355697631836, + "rewards/rejected": -24.178863525390625, + "step": 4200 + }, + { + "epoch": 1.92, + "eval_logits/chosen": -1.4128456115722656, + "eval_logits/rejected": -1.331875205039978, + "eval_logps/chosen": -87.6678466796875, + "eval_logps/rejected": -114.6224594116211, + "eval_loss": 0.007396237924695015, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.8822879791259766, + "eval_rewards/margins": 25.522655487060547, + "eval_rewards/rejected": -24.640369415283203, + "eval_runtime": 63.2696, + "eval_samples_per_second": 45.235, + "eval_steps_per_second": 2.829, + "step": 4200 + }, + { + "epoch": 1.92, + "learning_rate": 1.1984784446322908e-07, + "logits/chosen": -1.3565280437469482, + "logits/rejected": -1.2859523296356201, + "logps/chosen": -85.88087463378906, + "logps/rejected": -116.72319030761719, + "loss": 0.0034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.10283108055591583, + "rewards/margins": 24.91754150390625, + "rewards/rejected": -25.020374298095703, + "step": 4210 + }, + { + "epoch": 1.93, + "learning_rate": 1.1934065934065933e-07, + "logits/chosen": -1.3157243728637695, + "logits/rejected": -1.253631830215454, + "logps/chosen": -88.73226165771484, + "logps/rejected": -118.5372543334961, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7852961421012878, + "rewards/margins": 26.198617935180664, + "rewards/rejected": -25.41332244873047, + "step": 4220 + }, + { + 
"epoch": 1.93, + "learning_rate": 1.188334742180896e-07, + "logits/chosen": -1.3382046222686768, + "logits/rejected": -1.2745530605316162, + "logps/chosen": -87.59422302246094, + "logps/rejected": -120.77787017822266, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4094334542751312, + "rewards/margins": 26.14259910583496, + "rewards/rejected": -25.733165740966797, + "step": 4230 + }, + { + "epoch": 1.94, + "learning_rate": 1.1832628909551987e-07, + "logits/chosen": -1.3534907102584839, + "logits/rejected": -1.2823253870010376, + "logps/chosen": -84.35427856445312, + "logps/rejected": -119.73152923583984, + "loss": 0.0029, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9467264413833618, + "rewards/margins": 28.15091323852539, + "rewards/rejected": -26.20418357849121, + "step": 4240 + }, + { + "epoch": 1.94, + "learning_rate": 1.1781910397295012e-07, + "logits/chosen": -1.325655221939087, + "logits/rejected": -1.2503167390823364, + "logps/chosen": -89.13822937011719, + "logps/rejected": -122.24861145019531, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.596404790878296, + "rewards/margins": 26.64560317993164, + "rewards/rejected": -25.0491943359375, + "step": 4250 + }, + { + "epoch": 1.94, + "learning_rate": 1.1731191885038039e-07, + "logits/chosen": -1.2953070402145386, + "logits/rejected": -1.2415964603424072, + "logps/chosen": -89.16068267822266, + "logps/rejected": -121.19921875, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6085556745529175, + "rewards/margins": 25.759998321533203, + "rewards/rejected": -25.15144157409668, + "step": 4260 + }, + { + "epoch": 1.95, + "learning_rate": 1.1680473372781064e-07, + "logits/chosen": -1.314546823501587, + "logits/rejected": -1.2527072429656982, + "logps/chosen": -89.053466796875, + "logps/rejected": -118.50617980957031, + "loss": 0.0079, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5143841505050659, + 
"rewards/margins": 25.380834579467773, + "rewards/rejected": -24.866455078125, + "step": 4270 + }, + { + "epoch": 1.95, + "learning_rate": 1.1629754860524091e-07, + "logits/chosen": -1.29738450050354, + "logits/rejected": -1.256644606590271, + "logps/chosen": -91.60551452636719, + "logps/rejected": -119.3724136352539, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4654719829559326, + "rewards/margins": 25.70328140258789, + "rewards/rejected": -24.237810134887695, + "step": 4280 + }, + { + "epoch": 1.96, + "learning_rate": 1.1579036348267118e-07, + "logits/chosen": -1.3026115894317627, + "logits/rejected": -1.2353935241699219, + "logps/chosen": -88.94530487060547, + "logps/rejected": -110.451904296875, + "loss": 0.0014, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.12701252102851868, + "rewards/margins": 23.56112289428711, + "rewards/rejected": -23.43410873413086, + "step": 4290 + }, + { + "epoch": 1.96, + "learning_rate": 1.1528317836010143e-07, + "logits/chosen": -1.311436414718628, + "logits/rejected": -1.2566239833831787, + "logps/chosen": -85.2987289428711, + "logps/rejected": -115.095947265625, + "loss": 0.0046, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6091933846473694, + "rewards/margins": 25.31754493713379, + "rewards/rejected": -24.70834732055664, + "step": 4300 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -1.4097875356674194, + "eval_logits/rejected": -1.3324825763702393, + "eval_logps/chosen": -87.3016357421875, + "eval_logps/rejected": -111.95087432861328, + "eval_loss": 0.006749654188752174, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.0653921365737915, + "eval_rewards/margins": 24.369966506958008, + "eval_rewards/rejected": -23.30457305908203, + "eval_runtime": 121.9318, + "eval_samples_per_second": 23.472, + "eval_steps_per_second": 1.468, + "step": 4300 + }, + { + "epoch": 1.97, + "learning_rate": 1.147759932375317e-07, + 
"logits/chosen": -1.3246426582336426, + "logits/rejected": -1.2590562105178833, + "logps/chosen": -88.42132568359375, + "logps/rejected": -115.56095886230469, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7354822158813477, + "rewards/margins": 25.661792755126953, + "rewards/rejected": -23.926311492919922, + "step": 4310 + }, + { + "epoch": 1.97, + "learning_rate": 1.1426880811496195e-07, + "logits/chosen": -1.3025567531585693, + "logits/rejected": -1.227550745010376, + "logps/chosen": -93.46360778808594, + "logps/rejected": -112.05072021484375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7918212413787842, + "rewards/margins": 25.10645866394043, + "rewards/rejected": -23.314638137817383, + "step": 4320 + }, + { + "epoch": 1.98, + "learning_rate": 1.1376162299239222e-07, + "logits/chosen": -1.3441492319107056, + "logits/rejected": -1.2833201885223389, + "logps/chosen": -87.40711975097656, + "logps/rejected": -111.749267578125, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4917819499969482, + "rewards/margins": 24.15626335144043, + "rewards/rejected": -22.664485931396484, + "step": 4330 + }, + { + "epoch": 1.98, + "learning_rate": 1.1325443786982247e-07, + "logits/chosen": -1.2675365209579468, + "logits/rejected": -1.2221524715423584, + "logps/chosen": -87.13706970214844, + "logps/rejected": -113.51161193847656, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4677601754665375, + "rewards/margins": 23.889986038208008, + "rewards/rejected": -23.42222785949707, + "step": 4340 + }, + { + "epoch": 1.99, + "learning_rate": 1.1274725274725274e-07, + "logits/chosen": -1.2820374965667725, + "logits/rejected": -1.224862813949585, + "logps/chosen": -86.58576965332031, + "logps/rejected": -113.4376220703125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2789913415908813, + "rewards/margins": 24.847978591918945, + "rewards/rejected": -23.568988800048828, + "step": 4350 
+ }, + { + "epoch": 1.99, + "learning_rate": 1.1224006762468301e-07, + "logits/chosen": -1.3038794994354248, + "logits/rejected": -1.2235076427459717, + "logps/chosen": -91.44725036621094, + "logps/rejected": -112.098876953125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9584652185440063, + "rewards/margins": 24.83078384399414, + "rewards/rejected": -23.872318267822266, + "step": 4360 + }, + { + "epoch": 1.99, + "learning_rate": 1.1173288250211326e-07, + "logits/chosen": -1.3245998620986938, + "logits/rejected": -1.2639939785003662, + "logps/chosen": -88.39584350585938, + "logps/rejected": -113.70811462402344, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3476972579956055, + "rewards/margins": 23.937400817871094, + "rewards/rejected": -22.589706420898438, + "step": 4370 + }, + { + "epoch": 2.0, + "learning_rate": 1.1122569737954353e-07, + "logits/chosen": -1.3038419485092163, + "logits/rejected": -1.2349718809127808, + "logps/chosen": -88.70793151855469, + "logps/rejected": -110.3454360961914, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9998445510864258, + "rewards/margins": 24.013072967529297, + "rewards/rejected": -22.013225555419922, + "step": 4380 + }, + { + "epoch": 2.0, + "learning_rate": 1.1071851225697379e-07, + "logits/chosen": -1.3599039316177368, + "logits/rejected": -1.282843828201294, + "logps/chosen": -95.58447265625, + "logps/rejected": -119.10038757324219, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9651339650154114, + "rewards/margins": 25.86013412475586, + "rewards/rejected": -24.89499855041504, + "step": 4390 + }, + { + "epoch": 2.01, + "learning_rate": 1.1021132713440405e-07, + "logits/chosen": -1.2892515659332275, + "logits/rejected": -1.2328523397445679, + "logps/chosen": -88.33984375, + "logps/rejected": -111.24787902832031, + "loss": 0.0024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.816326916217804, + "rewards/margins": 
23.1959171295166, + "rewards/rejected": -22.379589080810547, + "step": 4400 + }, + { + "epoch": 2.01, + "eval_logits/chosen": -1.4092082977294922, + "eval_logits/rejected": -1.3323308229446411, + "eval_logps/chosen": -86.91348266601562, + "eval_logps/rejected": -111.49284362792969, + "eval_loss": 0.006608007475733757, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.2594695091247559, + "eval_rewards/margins": 24.33502960205078, + "eval_rewards/rejected": -23.075557708740234, + "eval_runtime": 62.0198, + "eval_samples_per_second": 46.147, + "eval_steps_per_second": 2.886, + "step": 4400 + }, + { + "epoch": 2.01, + "learning_rate": 1.0970414201183432e-07, + "logits/chosen": -1.2805674076080322, + "logits/rejected": -1.221649408340454, + "logps/chosen": -94.44866943359375, + "logps/rejected": -120.46419525146484, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9686487317085266, + "rewards/margins": 25.624675750732422, + "rewards/rejected": -24.65602684020996, + "step": 4410 + }, + { + "epoch": 2.02, + "learning_rate": 1.0919695688926457e-07, + "logits/chosen": -1.3352880477905273, + "logits/rejected": -1.2853882312774658, + "logps/chosen": -86.63716125488281, + "logps/rejected": -115.88981628417969, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9782888293266296, + "rewards/margins": 23.240489959716797, + "rewards/rejected": -22.262203216552734, + "step": 4420 + }, + { + "epoch": 2.02, + "learning_rate": 1.0868977176669484e-07, + "logits/chosen": -1.3116331100463867, + "logits/rejected": -1.2609537839889526, + "logps/chosen": -85.02494812011719, + "logps/rejected": -115.8293228149414, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2943823337554932, + "rewards/margins": 24.715749740600586, + "rewards/rejected": -23.421369552612305, + "step": 4430 + }, + { + "epoch": 2.03, + "learning_rate": 1.081825866441251e-07, + "logits/chosen": -1.3299988508224487, + 
"logits/rejected": -1.266905665397644, + "logps/chosen": -90.40599060058594, + "logps/rejected": -116.70439147949219, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.15012526512146, + "rewards/margins": 24.405237197875977, + "rewards/rejected": -23.25511360168457, + "step": 4440 + }, + { + "epoch": 2.03, + "learning_rate": 1.0767540152155536e-07, + "logits/chosen": -1.324263572692871, + "logits/rejected": -1.2538119554519653, + "logps/chosen": -83.78913116455078, + "logps/rejected": -115.94393157958984, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9307165145874023, + "rewards/margins": 24.19430160522461, + "rewards/rejected": -23.263586044311523, + "step": 4450 + }, + { + "epoch": 2.04, + "learning_rate": 1.0716821639898562e-07, + "logits/chosen": -1.340303659439087, + "logits/rejected": -1.2662428617477417, + "logps/chosen": -93.44474792480469, + "logps/rejected": -110.15861511230469, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.513580560684204, + "rewards/margins": 24.58599853515625, + "rewards/rejected": -23.072418212890625, + "step": 4460 + }, + { + "epoch": 2.04, + "learning_rate": 1.0666103127641589e-07, + "logits/chosen": -1.3335578441619873, + "logits/rejected": -1.2744941711425781, + "logps/chosen": -81.96378326416016, + "logps/rejected": -116.51377868652344, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.511701762676239, + "rewards/margins": 25.025680541992188, + "rewards/rejected": -24.51398277282715, + "step": 4470 + }, + { + "epoch": 2.04, + "learning_rate": 1.0615384615384615e-07, + "logits/chosen": -1.3683226108551025, + "logits/rejected": -1.2874228954315186, + "logps/chosen": -84.69481658935547, + "logps/rejected": -113.8635482788086, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6471071243286133, + "rewards/margins": 25.367168426513672, + "rewards/rejected": -23.720062255859375, + "step": 4480 + }, + { + "epoch": 2.05, + 
"learning_rate": 1.0564666103127641e-07, + "logits/chosen": -1.3335193395614624, + "logits/rejected": -1.2593390941619873, + "logps/chosen": -89.44517517089844, + "logps/rejected": -113.9664535522461, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6522144079208374, + "rewards/margins": 24.980897903442383, + "rewards/rejected": -23.328683853149414, + "step": 4490 + }, + { + "epoch": 2.05, + "learning_rate": 1.0513947590870668e-07, + "logits/chosen": -1.3129395246505737, + "logits/rejected": -1.2438514232635498, + "logps/chosen": -89.5646743774414, + "logps/rejected": -110.78108215332031, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7508271932601929, + "rewards/margins": 24.931434631347656, + "rewards/rejected": -24.180606842041016, + "step": 4500 + }, + { + "epoch": 2.05, + "eval_logits/chosen": -1.4130858182907104, + "eval_logits/rejected": -1.3323482275009155, + "eval_logps/chosen": -87.22586059570312, + "eval_logps/rejected": -112.64375305175781, + "eval_loss": 0.0067411912605166435, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": 1.1032801866531372, + "eval_rewards/margins": 24.75430679321289, + "eval_rewards/rejected": -23.651025772094727, + "eval_runtime": 68.0364, + "eval_samples_per_second": 42.066, + "eval_steps_per_second": 2.631, + "step": 4500 + }, + { + "epoch": 2.06, + "learning_rate": 1.0463229078613693e-07, + "logits/chosen": -1.338761568069458, + "logits/rejected": -1.2733943462371826, + "logps/chosen": -93.2102279663086, + "logps/rejected": -115.36387634277344, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.200361728668213, + "rewards/margins": 24.453805923461914, + "rewards/rejected": -23.25344467163086, + "step": 4510 + }, + { + "epoch": 2.06, + "learning_rate": 1.041251056635672e-07, + "logits/chosen": -1.3229678869247437, + "logits/rejected": -1.258108377456665, + "logps/chosen": -88.09375762939453, + "logps/rejected": -119.7026596069336, + "loss": 
0.0055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.234534740447998, + "rewards/margins": 26.626230239868164, + "rewards/rejected": -24.39169692993164, + "step": 4520 + }, + { + "epoch": 2.07, + "learning_rate": 1.0361792054099746e-07, + "logits/chosen": -1.344191312789917, + "logits/rejected": -1.2628560066223145, + "logps/chosen": -94.99612426757812, + "logps/rejected": -112.13182067871094, + "loss": 0.0055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6532214879989624, + "rewards/margins": 25.270252227783203, + "rewards/rejected": -23.61703109741211, + "step": 4530 + }, + { + "epoch": 2.07, + "learning_rate": 1.0311073541842772e-07, + "logits/chosen": -1.3356167078018188, + "logits/rejected": -1.274715542793274, + "logps/chosen": -85.35197448730469, + "logps/rejected": -114.82647705078125, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3368360996246338, + "rewards/margins": 25.388629913330078, + "rewards/rejected": -24.051794052124023, + "step": 4540 + }, + { + "epoch": 2.08, + "learning_rate": 1.0260355029585799e-07, + "logits/chosen": -1.3028538227081299, + "logits/rejected": -1.243327021598816, + "logps/chosen": -83.73817443847656, + "logps/rejected": -119.57144927978516, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.675358235836029, + "rewards/margins": 25.816675186157227, + "rewards/rejected": -25.141313552856445, + "step": 4550 + }, + { + "epoch": 2.08, + "learning_rate": 1.0209636517328824e-07, + "logits/chosen": -1.3460599184036255, + "logits/rejected": -1.2746741771697998, + "logps/chosen": -86.41990661621094, + "logps/rejected": -114.42486572265625, + "loss": 0.0076, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9956108331680298, + "rewards/margins": 26.080272674560547, + "rewards/rejected": -25.084659576416016, + "step": 4560 + }, + { + "epoch": 2.09, + "learning_rate": 1.0158918005071851e-07, + "logits/chosen": -1.33854079246521, + 
"logits/rejected": -1.2720884084701538, + "logps/chosen": -88.3348617553711, + "logps/rejected": -110.352783203125, + "loss": 0.0055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0480217933654785, + "rewards/margins": 24.384031295776367, + "rewards/rejected": -23.336009979248047, + "step": 4570 + }, + { + "epoch": 2.09, + "learning_rate": 1.0108199492814876e-07, + "logits/chosen": -1.3189232349395752, + "logits/rejected": -1.2537434101104736, + "logps/chosen": -93.42473602294922, + "logps/rejected": -117.38157653808594, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.220496416091919, + "rewards/margins": 25.104259490966797, + "rewards/rejected": -23.88376235961914, + "step": 4580 + }, + { + "epoch": 2.09, + "learning_rate": 1.0057480980557903e-07, + "logits/chosen": -1.346653938293457, + "logits/rejected": -1.2888588905334473, + "logps/chosen": -84.58158111572266, + "logps/rejected": -116.2260971069336, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9570630788803101, + "rewards/margins": 24.75169563293457, + "rewards/rejected": -23.794628143310547, + "step": 4590 + }, + { + "epoch": 2.1, + "learning_rate": 1.000676246830093e-07, + "logits/chosen": -1.3445775508880615, + "logits/rejected": -1.2613394260406494, + "logps/chosen": -88.18348693847656, + "logps/rejected": -117.32752990722656, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2772257328033447, + "rewards/margins": 27.420055389404297, + "rewards/rejected": -25.1428279876709, + "step": 4600 + }, + { + "epoch": 2.1, + "eval_logits/chosen": -1.4255520105361938, + "eval_logits/rejected": -1.3487104177474976, + "eval_logps/chosen": -87.3498306274414, + "eval_logps/rejected": -113.354248046875, + "eval_loss": 0.006937822792679071, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.0413002967834473, + "eval_rewards/margins": 25.047561645507812, + "eval_rewards/rejected": -24.006263732910156, + 
"eval_runtime": 70.6268, + "eval_samples_per_second": 40.523, + "eval_steps_per_second": 2.534, + "step": 4600 + }, + { + "epoch": 2.1, + "learning_rate": 9.956043956043955e-08, + "logits/chosen": -1.3094035387039185, + "logits/rejected": -1.2451339960098267, + "logps/chosen": -89.12696838378906, + "logps/rejected": -114.3353271484375, + "loss": 0.0036, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.26013678312301636, + "rewards/margins": 23.884912490844727, + "rewards/rejected": -23.624773025512695, + "step": 4610 + }, + { + "epoch": 2.11, + "learning_rate": 9.905325443786982e-08, + "logits/chosen": -1.3309595584869385, + "logits/rejected": -1.2776004076004028, + "logps/chosen": -86.14755249023438, + "logps/rejected": -116.1485366821289, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.36527928709983826, + "rewards/margins": 24.457759857177734, + "rewards/rejected": -24.092479705810547, + "step": 4620 + }, + { + "epoch": 2.11, + "learning_rate": 9.854606931530007e-08, + "logits/chosen": -1.3390415906906128, + "logits/rejected": -1.2725872993469238, + "logps/chosen": -91.37030029296875, + "logps/rejected": -115.81642150878906, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0659124851226807, + "rewards/margins": 25.772014617919922, + "rewards/rejected": -24.706104278564453, + "step": 4630 + }, + { + "epoch": 2.12, + "learning_rate": 9.803888419273034e-08, + "logits/chosen": -1.3206666707992554, + "logits/rejected": -1.2614423036575317, + "logps/chosen": -83.52031707763672, + "logps/rejected": -116.8504867553711, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7927557826042175, + "rewards/margins": 25.795413970947266, + "rewards/rejected": -25.002655029296875, + "step": 4640 + }, + { + "epoch": 2.12, + "learning_rate": 9.753169907016061e-08, + "logits/chosen": -1.3370873928070068, + "logits/rejected": -1.2674705982208252, + "logps/chosen": -87.10755157470703, 
+ "logps/rejected": -119.5748519897461, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.217125654220581, + "rewards/margins": 27.1856632232666, + "rewards/rejected": -24.968536376953125, + "step": 4650 + }, + { + "epoch": 2.13, + "learning_rate": 9.702451394759086e-08, + "logits/chosen": -1.3084181547164917, + "logits/rejected": -1.2384984493255615, + "logps/chosen": -87.81915283203125, + "logps/rejected": -113.61048889160156, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1231558322906494, + "rewards/margins": 25.29172706604004, + "rewards/rejected": -24.168569564819336, + "step": 4660 + }, + { + "epoch": 2.13, + "learning_rate": 9.651732882502113e-08, + "logits/chosen": -1.31204092502594, + "logits/rejected": -1.2542951107025146, + "logps/chosen": -92.28341674804688, + "logps/rejected": -115.6882553100586, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2532768249511719, + "rewards/margins": 24.862823486328125, + "rewards/rejected": -23.609546661376953, + "step": 4670 + }, + { + "epoch": 2.14, + "learning_rate": 9.601014370245138e-08, + "logits/chosen": -1.3849248886108398, + "logits/rejected": -1.3047869205474854, + "logps/chosen": -88.0652084350586, + "logps/rejected": -116.75108337402344, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.82236385345459, + "rewards/margins": 27.45639419555664, + "rewards/rejected": -24.634029388427734, + "step": 4680 + }, + { + "epoch": 2.14, + "learning_rate": 9.550295857988165e-08, + "logits/chosen": -1.3264617919921875, + "logits/rejected": -1.2588824033737183, + "logps/chosen": -90.63042449951172, + "logps/rejected": -117.13484954833984, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8007968664169312, + "rewards/margins": 25.313175201416016, + "rewards/rejected": -24.512378692626953, + "step": 4690 + }, + { + "epoch": 2.15, + "learning_rate": 9.49957734573119e-08, + "logits/chosen": -1.319097876548767, + 
"logits/rejected": -1.2523882389068604, + "logps/chosen": -91.26343536376953, + "logps/rejected": -122.9940414428711, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29426923394203186, + "rewards/margins": 26.019664764404297, + "rewards/rejected": -25.72539710998535, + "step": 4700 + }, + { + "epoch": 2.15, + "eval_logits/chosen": -1.431301236152649, + "eval_logits/rejected": -1.3519829511642456, + "eval_logps/chosen": -87.77189636230469, + "eval_logps/rejected": -114.70515441894531, + "eval_loss": 0.007204956840723753, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": 0.8302635550498962, + "eval_rewards/margins": 25.51198387145996, + "eval_rewards/rejected": -24.681718826293945, + "eval_runtime": 71.5845, + "eval_samples_per_second": 39.981, + "eval_steps_per_second": 2.501, + "step": 4700 + }, + { + "epoch": 2.15, + "learning_rate": 9.448858833474217e-08, + "logits/chosen": -1.3294093608856201, + "logits/rejected": -1.255859613418579, + "logps/chosen": -87.07064819335938, + "logps/rejected": -119.63526916503906, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.91957026720047, + "rewards/margins": 25.611663818359375, + "rewards/rejected": -24.692096710205078, + "step": 4710 + }, + { + "epoch": 2.15, + "learning_rate": 9.398140321217244e-08, + "logits/chosen": -1.3767739534378052, + "logits/rejected": -1.3070622682571411, + "logps/chosen": -91.01148986816406, + "logps/rejected": -119.0385971069336, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5754287242889404, + "rewards/margins": 25.93698501586914, + "rewards/rejected": -24.361553192138672, + "step": 4720 + }, + { + "epoch": 2.16, + "learning_rate": 9.34742180896027e-08, + "logits/chosen": -1.3255687952041626, + "logits/rejected": -1.2571724653244019, + "logps/chosen": -92.34770202636719, + "logps/rejected": -119.81795501708984, + "loss": 0.0067, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 
-0.29987969994544983, + "rewards/margins": 24.60814094543457, + "rewards/rejected": -24.90802001953125, + "step": 4730 + }, + { + "epoch": 2.16, + "learning_rate": 9.296703296703296e-08, + "logits/chosen": -1.364595651626587, + "logits/rejected": -1.2936162948608398, + "logps/chosen": -88.7773666381836, + "logps/rejected": -116.1494140625, + "loss": 0.0024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.7363441586494446, + "rewards/margins": 25.116535186767578, + "rewards/rejected": -25.852880477905273, + "step": 4740 + }, + { + "epoch": 2.17, + "learning_rate": 9.245984784446322e-08, + "logits/chosen": -1.3715788125991821, + "logits/rejected": -1.3029972314834595, + "logps/chosen": -93.72126770019531, + "logps/rejected": -118.23783111572266, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3293616771697998, + "rewards/margins": 25.474594116210938, + "rewards/rejected": -24.145231246948242, + "step": 4750 + }, + { + "epoch": 2.17, + "learning_rate": 9.195266272189349e-08, + "logits/chosen": -1.3523025512695312, + "logits/rejected": -1.2690999507904053, + "logps/chosen": -93.32905578613281, + "logps/rejected": -118.7462158203125, + "loss": 0.0012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9021531343460083, + "rewards/margins": 26.36541748046875, + "rewards/rejected": -25.463268280029297, + "step": 4760 + }, + { + "epoch": 2.18, + "learning_rate": 9.144547759932375e-08, + "logits/chosen": -1.294425368309021, + "logits/rejected": -1.2407166957855225, + "logps/chosen": -90.24799346923828, + "logps/rejected": -123.93524169921875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4086612462997437, + "rewards/margins": 25.56521224975586, + "rewards/rejected": -24.15654945373535, + "step": 4770 + }, + { + "epoch": 2.18, + "learning_rate": 9.093829247675401e-08, + "logits/chosen": -1.3438308238983154, + "logits/rejected": -1.275816798210144, + "logps/chosen": -91.6666488647461, + 
"logps/rejected": -119.28436279296875, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.383312702178955, + "rewards/margins": 27.003463745117188, + "rewards/rejected": -24.620147705078125, + "step": 4780 + }, + { + "epoch": 2.19, + "learning_rate": 9.043110735418427e-08, + "logits/chosen": -1.3451048135757446, + "logits/rejected": -1.2772449254989624, + "logps/chosen": -94.4778823852539, + "logps/rejected": -116.2429428100586, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2315725088119507, + "rewards/margins": 25.15644073486328, + "rewards/rejected": -23.924867630004883, + "step": 4790 + }, + { + "epoch": 2.19, + "learning_rate": 8.992392223161453e-08, + "logits/chosen": -1.2999621629714966, + "logits/rejected": -1.2457572221755981, + "logps/chosen": -88.31067657470703, + "logps/rejected": -120.50489807128906, + "loss": 0.0067, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5283034443855286, + "rewards/margins": 25.667068481445312, + "rewards/rejected": -25.13875961303711, + "step": 4800 + }, + { + "epoch": 2.19, + "eval_logits/chosen": -1.4411685466766357, + "eval_logits/rejected": -1.3603928089141846, + "eval_logps/chosen": -87.89783477783203, + "eval_logps/rejected": -114.8197250366211, + "eval_loss": 0.007284725550562143, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.7672966718673706, + "eval_rewards/margins": 25.506298065185547, + "eval_rewards/rejected": -24.739002227783203, + "eval_runtime": 61.4965, + "eval_samples_per_second": 46.539, + "eval_steps_per_second": 2.911, + "step": 4800 + }, + { + "epoch": 2.2, + "learning_rate": 8.94167371090448e-08, + "logits/chosen": -1.3604772090911865, + "logits/rejected": -1.291609287261963, + "logps/chosen": -87.88396453857422, + "logps/rejected": -116.81595611572266, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6294358968734741, + "rewards/margins": 25.354211807250977, + "rewards/rejected": 
-24.724775314331055, + "step": 4810 + }, + { + "epoch": 2.2, + "learning_rate": 8.890955198647506e-08, + "logits/chosen": -1.327947735786438, + "logits/rejected": -1.2544294595718384, + "logps/chosen": -87.47128295898438, + "logps/rejected": -121.36673736572266, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21699205040931702, + "rewards/margins": 25.810955047607422, + "rewards/rejected": -25.593963623046875, + "step": 4820 + }, + { + "epoch": 2.2, + "learning_rate": 8.840236686390532e-08, + "logits/chosen": -1.3332288265228271, + "logits/rejected": -1.2797290086746216, + "logps/chosen": -84.39314270019531, + "logps/rejected": -118.59745788574219, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0452686548233032, + "rewards/margins": 26.805339813232422, + "rewards/rejected": -25.760074615478516, + "step": 4830 + }, + { + "epoch": 2.21, + "learning_rate": 8.789518174133559e-08, + "logits/chosen": -1.3392903804779053, + "logits/rejected": -1.273781418800354, + "logps/chosen": -87.46485137939453, + "logps/rejected": -118.17325592041016, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027283096686005592, + "rewards/margins": 25.02812385559082, + "rewards/rejected": -25.05540657043457, + "step": 4840 + }, + { + "epoch": 2.21, + "learning_rate": 8.738799661876584e-08, + "logits/chosen": -1.36063551902771, + "logits/rejected": -1.275144338607788, + "logps/chosen": -93.5079116821289, + "logps/rejected": -117.1824951171875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3975207209587097, + "rewards/margins": 25.79745101928711, + "rewards/rejected": -25.399932861328125, + "step": 4850 + }, + { + "epoch": 2.22, + "learning_rate": 8.688081149619611e-08, + "logits/chosen": -1.3441591262817383, + "logits/rejected": -1.2624187469482422, + "logps/chosen": -91.46955108642578, + "logps/rejected": -112.102783203125, + "loss": 0.0011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 
2.1940736770629883, + "rewards/margins": 25.44129753112793, + "rewards/rejected": -23.247224807739258, + "step": 4860 + }, + { + "epoch": 2.22, + "learning_rate": 8.637362637362636e-08, + "logits/chosen": -1.371538758277893, + "logits/rejected": -1.3088654279708862, + "logps/chosen": -86.52335357666016, + "logps/rejected": -117.4716567993164, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9726071357727051, + "rewards/margins": 26.16326332092285, + "rewards/rejected": -25.190654754638672, + "step": 4870 + }, + { + "epoch": 2.23, + "learning_rate": 8.586644125105663e-08, + "logits/chosen": -1.3846619129180908, + "logits/rejected": -1.307712197303772, + "logps/chosen": -87.76756286621094, + "logps/rejected": -115.74507141113281, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.3747789263725281, + "rewards/margins": 25.810970306396484, + "rewards/rejected": -25.436189651489258, + "step": 4880 + }, + { + "epoch": 2.23, + "learning_rate": 8.53592561284869e-08, + "logits/chosen": -1.3542847633361816, + "logits/rejected": -1.288946270942688, + "logps/chosen": -91.2932357788086, + "logps/rejected": -116.8852767944336, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8335541486740112, + "rewards/margins": 25.681522369384766, + "rewards/rejected": -24.847965240478516, + "step": 4890 + }, + { + "epoch": 2.24, + "learning_rate": 8.485207100591715e-08, + "logits/chosen": -1.3608229160308838, + "logits/rejected": -1.2844916582107544, + "logps/chosen": -88.2170181274414, + "logps/rejected": -120.30814361572266, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6866910457611084, + "rewards/margins": 27.874353408813477, + "rewards/rejected": -26.187664031982422, + "step": 4900 + }, + { + "epoch": 2.24, + "eval_logits/chosen": -1.461911678314209, + "eval_logits/rejected": -1.3814847469329834, + "eval_logps/chosen": -87.90396881103516, + "eval_logps/rejected": -115.87995147705078, + 
"eval_loss": 0.007340571843087673, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.7642290592193604, + "eval_rewards/margins": 26.033344268798828, + "eval_rewards/rejected": -25.26911735534668, + "eval_runtime": 68.6122, + "eval_samples_per_second": 41.713, + "eval_steps_per_second": 2.609, + "step": 4900 + }, + { + "epoch": 2.24, + "learning_rate": 8.434488588334742e-08, + "logits/chosen": -1.4112756252288818, + "logits/rejected": -1.3500181436538696, + "logps/chosen": -89.21757507324219, + "logps/rejected": -117.94172668457031, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4118437170982361, + "rewards/margins": 24.942344665527344, + "rewards/rejected": -24.530498504638672, + "step": 4910 + }, + { + "epoch": 2.25, + "learning_rate": 8.383770076077767e-08, + "logits/chosen": -1.3511896133422852, + "logits/rejected": -1.2767199277877808, + "logps/chosen": -95.7844009399414, + "logps/rejected": -117.9591293334961, + "loss": 0.0023, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7757484912872314, + "rewards/margins": 26.916187286376953, + "rewards/rejected": -25.14043617248535, + "step": 4920 + }, + { + "epoch": 2.25, + "learning_rate": 8.333051563820794e-08, + "logits/chosen": -1.3630616664886475, + "logits/rejected": -1.2954555749893188, + "logps/chosen": -91.95133972167969, + "logps/rejected": -123.22148132324219, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0093523263931274, + "rewards/margins": 27.7642822265625, + "rewards/rejected": -26.754928588867188, + "step": 4930 + }, + { + "epoch": 2.25, + "learning_rate": 8.282333051563821e-08, + "logits/chosen": -1.3652262687683105, + "logits/rejected": -1.3002612590789795, + "logps/chosen": -89.00601959228516, + "logps/rejected": -117.67951965332031, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.025102138519287, + "rewards/margins": 26.249094009399414, + "rewards/rejected": 
-25.223987579345703, + "step": 4940 + }, + { + "epoch": 2.26, + "learning_rate": 8.231614539306846e-08, + "logits/chosen": -1.3879879713058472, + "logits/rejected": -1.3240267038345337, + "logps/chosen": -87.9173812866211, + "logps/rejected": -117.83023834228516, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08251948654651642, + "rewards/margins": 24.835966110229492, + "rewards/rejected": -24.75344467163086, + "step": 4950 + }, + { + "epoch": 2.26, + "learning_rate": 8.180896027049873e-08, + "logits/chosen": -1.372811198234558, + "logits/rejected": -1.2928178310394287, + "logps/chosen": -90.76931762695312, + "logps/rejected": -118.8313217163086, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.388689398765564, + "rewards/margins": 26.92373275756836, + "rewards/rejected": -25.535043716430664, + "step": 4960 + }, + { + "epoch": 2.27, + "learning_rate": 8.130177514792898e-08, + "logits/chosen": -1.3881031274795532, + "logits/rejected": -1.314194917678833, + "logps/chosen": -87.59803771972656, + "logps/rejected": -118.09504699707031, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4410366117954254, + "rewards/margins": 26.20395278930664, + "rewards/rejected": -25.762914657592773, + "step": 4970 + }, + { + "epoch": 2.27, + "learning_rate": 8.079459002535925e-08, + "logits/chosen": -1.3640211820602417, + "logits/rejected": -1.2896654605865479, + "logps/chosen": -87.9101333618164, + "logps/rejected": -117.8978500366211, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.964639663696289, + "rewards/margins": 28.1193790435791, + "rewards/rejected": -26.154743194580078, + "step": 4980 + }, + { + "epoch": 2.28, + "learning_rate": 8.02874049027895e-08, + "logits/chosen": -1.377068281173706, + "logits/rejected": -1.3123360872268677, + "logps/chosen": -87.87947845458984, + "logps/rejected": -125.39814758300781, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9372149705886841, + 
"rewards/margins": 27.093730926513672, + "rewards/rejected": -26.156518936157227, + "step": 4990 + }, + { + "epoch": 2.28, + "learning_rate": 7.978021978021977e-08, + "logits/chosen": -1.3721301555633545, + "logits/rejected": -1.302929162979126, + "logps/chosen": -88.95343780517578, + "logps/rejected": -122.36506652832031, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1758254766464233, + "rewards/margins": 27.310205459594727, + "rewards/rejected": -26.134380340576172, + "step": 5000 + }, + { + "epoch": 2.28, + "eval_logits/chosen": -1.4729403257369995, + "eval_logits/rejected": -1.3894855976104736, + "eval_logps/chosen": -88.23526763916016, + "eval_logps/rejected": -117.04609680175781, + "eval_loss": 0.0077310591004788876, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.5985845923423767, + "eval_rewards/margins": 26.450777053833008, + "eval_rewards/rejected": -25.852190017700195, + "eval_runtime": 65.1848, + "eval_samples_per_second": 43.906, + "eval_steps_per_second": 2.746, + "step": 5000 + }, + { + "epoch": 2.29, + "learning_rate": 7.927303465765004e-08, + "logits/chosen": -1.4071820974349976, + "logits/rejected": -1.325002908706665, + "logps/chosen": -92.00203704833984, + "logps/rejected": -121.43309020996094, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.40916553139686584, + "rewards/margins": 25.54302978515625, + "rewards/rejected": -25.133861541748047, + "step": 5010 + }, + { + "epoch": 2.29, + "learning_rate": 7.87658495350803e-08, + "logits/chosen": -1.382490873336792, + "logits/rejected": -1.290702223777771, + "logps/chosen": -91.81244659423828, + "logps/rejected": -116.6902847290039, + "loss": 0.0065, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.020965814590454, + "rewards/margins": 27.167251586914062, + "rewards/rejected": -26.146282196044922, + "step": 5020 + }, + { + "epoch": 2.3, + "learning_rate": 7.825866441251056e-08, + "logits/chosen": 
-1.359053373336792, + "logits/rejected": -1.290919542312622, + "logps/chosen": -86.95582580566406, + "logps/rejected": -120.83614349365234, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.065791368484497, + "rewards/margins": 29.087757110595703, + "rewards/rejected": -27.021968841552734, + "step": 5030 + }, + { + "epoch": 2.3, + "learning_rate": 7.775147928994082e-08, + "logits/chosen": -1.369732141494751, + "logits/rejected": -1.3190780878067017, + "logps/chosen": -88.25080108642578, + "logps/rejected": -122.08647155761719, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4733394682407379, + "rewards/margins": 26.999135971069336, + "rewards/rejected": -26.52579689025879, + "step": 5040 + }, + { + "epoch": 2.3, + "learning_rate": 7.724429416737108e-08, + "logits/chosen": -1.3806296586990356, + "logits/rejected": -1.314660668373108, + "logps/chosen": -82.57357025146484, + "logps/rejected": -113.80106353759766, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6215075254440308, + "rewards/margins": 25.933462142944336, + "rewards/rejected": -25.311954498291016, + "step": 5050 + }, + { + "epoch": 2.31, + "learning_rate": 7.673710904480135e-08, + "logits/chosen": -1.3427412509918213, + "logits/rejected": -1.277625560760498, + "logps/chosen": -88.83329772949219, + "logps/rejected": -113.25550842285156, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39279159903526306, + "rewards/margins": 24.52106475830078, + "rewards/rejected": -24.128273010253906, + "step": 5060 + }, + { + "epoch": 2.31, + "learning_rate": 7.62299239222316e-08, + "logits/chosen": -1.3516267538070679, + "logits/rejected": -1.290647268295288, + "logps/chosen": -93.58750915527344, + "logps/rejected": -119.11576843261719, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06238238885998726, + "rewards/margins": 25.322040557861328, + "rewards/rejected": -25.384424209594727, + "step": 5070 + }, + { + "epoch": 
2.32, + "learning_rate": 7.572273879966187e-08, + "logits/chosen": -1.3757238388061523, + "logits/rejected": -1.306443452835083, + "logps/chosen": -93.31622314453125, + "logps/rejected": -117.56782531738281, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3956282138824463, + "rewards/margins": 26.834218978881836, + "rewards/rejected": -25.43859100341797, + "step": 5080 + }, + { + "epoch": 2.32, + "learning_rate": 7.521555367709213e-08, + "logits/chosen": -1.3766396045684814, + "logits/rejected": -1.3114879131317139, + "logps/chosen": -86.86039733886719, + "logps/rejected": -118.87870025634766, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0653996467590332, + "rewards/margins": 26.928573608398438, + "rewards/rejected": -25.863174438476562, + "step": 5090 + }, + { + "epoch": 2.33, + "learning_rate": 7.47083685545224e-08, + "logits/chosen": -1.38792884349823, + "logits/rejected": -1.31496262550354, + "logps/chosen": -86.50321197509766, + "logps/rejected": -119.1225357055664, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.047428846359253, + "rewards/margins": 26.675806045532227, + "rewards/rejected": -25.628376007080078, + "step": 5100 + }, + { + "epoch": 2.33, + "eval_logits/chosen": -1.4878861904144287, + "eval_logits/rejected": -1.4035993814468384, + "eval_logps/chosen": -88.03448486328125, + "eval_logps/rejected": -116.85780334472656, + "eval_loss": 0.007282613776624203, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.6989699006080627, + "eval_rewards/margins": 26.457014083862305, + "eval_rewards/rejected": -25.758041381835938, + "eval_runtime": 63.4536, + "eval_samples_per_second": 45.104, + "eval_steps_per_second": 2.821, + "step": 5100 + }, + { + "epoch": 2.33, + "learning_rate": 7.420118343195266e-08, + "logits/chosen": -1.3945187330245972, + "logits/rejected": -1.311867594718933, + "logps/chosen": -94.6859359741211, + "logps/rejected": -113.41032409667969, + "loss": 
0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1795772314071655, + "rewards/margins": 26.022451400756836, + "rewards/rejected": -24.842870712280273, + "step": 5110 + }, + { + "epoch": 2.34, + "learning_rate": 7.369399830938292e-08, + "logits/chosen": -1.3820443153381348, + "logits/rejected": -1.3166067600250244, + "logps/chosen": -83.39116668701172, + "logps/rejected": -115.35809326171875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4604426920413971, + "rewards/margins": 26.040401458740234, + "rewards/rejected": -25.579957962036133, + "step": 5120 + }, + { + "epoch": 2.34, + "learning_rate": 7.318681318681318e-08, + "logits/chosen": -1.3669214248657227, + "logits/rejected": -1.3000479936599731, + "logps/chosen": -90.93084716796875, + "logps/rejected": -114.6349868774414, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9245045781135559, + "rewards/margins": 26.640636444091797, + "rewards/rejected": -25.716135025024414, + "step": 5130 + }, + { + "epoch": 2.35, + "learning_rate": 7.267962806424344e-08, + "logits/chosen": -1.3965649604797363, + "logits/rejected": -1.3084334135055542, + "logps/chosen": -97.05128479003906, + "logps/rejected": -123.03981018066406, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9901596903800964, + "rewards/margins": 26.782718658447266, + "rewards/rejected": -25.792556762695312, + "step": 5140 + }, + { + "epoch": 2.35, + "learning_rate": 7.21724429416737e-08, + "logits/chosen": -1.3820029497146606, + "logits/rejected": -1.2966136932373047, + "logps/chosen": -90.78387451171875, + "logps/rejected": -119.2174301147461, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5053150653839111, + "rewards/margins": 27.21875, + "rewards/rejected": -25.713436126708984, + "step": 5150 + }, + { + "epoch": 2.36, + "learning_rate": 7.166525781910397e-08, + "logits/chosen": -1.3862712383270264, + "logits/rejected": -1.315342903137207, + 
"logps/chosen": -88.68326568603516, + "logps/rejected": -119.2471923828125, + "loss": 0.0044, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5128425359725952, + "rewards/margins": 24.922550201416016, + "rewards/rejected": -25.435388565063477, + "step": 5160 + }, + { + "epoch": 2.36, + "learning_rate": 7.115807269653423e-08, + "logits/chosen": -1.358323335647583, + "logits/rejected": -1.2905691862106323, + "logps/chosen": -88.0818862915039, + "logps/rejected": -118.68809509277344, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4642322063446045, + "rewards/margins": 26.244770050048828, + "rewards/rejected": -24.78053855895996, + "step": 5170 + }, + { + "epoch": 2.36, + "learning_rate": 7.06508875739645e-08, + "logits/chosen": -1.412334680557251, + "logits/rejected": -1.33579683303833, + "logps/chosen": -93.82710266113281, + "logps/rejected": -117.13541412353516, + "loss": 0.0023, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.39411622285842896, + "rewards/margins": 25.49358367919922, + "rewards/rejected": -25.099470138549805, + "step": 5180 + }, + { + "epoch": 2.37, + "learning_rate": 7.014370245139475e-08, + "logits/chosen": -1.3709720373153687, + "logits/rejected": -1.2929903268814087, + "logps/chosen": -91.2157974243164, + "logps/rejected": -124.2120132446289, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4855771064758301, + "rewards/margins": 27.795948028564453, + "rewards/rejected": -27.31036949157715, + "step": 5190 + }, + { + "epoch": 2.37, + "learning_rate": 6.963651732882502e-08, + "logits/chosen": -1.3689887523651123, + "logits/rejected": -1.2964820861816406, + "logps/chosen": -86.01536560058594, + "logps/rejected": -118.3643798828125, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1772948503494263, + "rewards/margins": 27.771255493164062, + "rewards/rejected": -26.59395980834961, + "step": 5200 + }, + { + "epoch": 2.37, + 
"eval_logits/chosen": -1.4872238636016846, + "eval_logits/rejected": -1.399623990058899, + "eval_logps/chosen": -88.03882598876953, + "eval_logps/rejected": -117.7686996459961, + "eval_loss": 0.007167758885771036, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.6967981457710266, + "eval_rewards/margins": 26.910293579101562, + "eval_rewards/rejected": -26.213491439819336, + "eval_runtime": 70.0341, + "eval_samples_per_second": 40.866, + "eval_steps_per_second": 2.556, + "step": 5200 + }, + { + "epoch": 2.38, + "learning_rate": 6.912933220625529e-08, + "logits/chosen": -1.4059669971466064, + "logits/rejected": -1.3302024602890015, + "logps/chosen": -88.3302230834961, + "logps/rejected": -118.16276550292969, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0470592975616455, + "rewards/margins": 26.200448989868164, + "rewards/rejected": -25.153392791748047, + "step": 5210 + }, + { + "epoch": 2.38, + "learning_rate": 6.862214708368554e-08, + "logits/chosen": -1.397664189338684, + "logits/rejected": -1.3320751190185547, + "logps/chosen": -85.62290954589844, + "logps/rejected": -119.31951904296875, + "loss": 0.0023, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.21012182533740997, + "rewards/margins": 25.69754409790039, + "rewards/rejected": -25.907669067382812, + "step": 5220 + }, + { + "epoch": 2.39, + "learning_rate": 6.811496196111581e-08, + "logits/chosen": -1.4220311641693115, + "logits/rejected": -1.3425581455230713, + "logps/chosen": -88.67344665527344, + "logps/rejected": -117.9163589477539, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3023020029067993, + "rewards/margins": 27.779937744140625, + "rewards/rejected": -26.477636337280273, + "step": 5230 + }, + { + "epoch": 2.39, + "learning_rate": 6.760777683854606e-08, + "logits/chosen": -1.3980010747909546, + "logits/rejected": -1.3161672353744507, + "logps/chosen": -94.78350830078125, + "logps/rejected": 
-118.20045471191406, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4626989364624023, + "rewards/margins": 27.554616928100586, + "rewards/rejected": -26.091922760009766, + "step": 5240 + }, + { + "epoch": 2.4, + "learning_rate": 6.710059171597633e-08, + "logits/chosen": -1.3799418210983276, + "logits/rejected": -1.3152214288711548, + "logps/chosen": -90.38761901855469, + "logps/rejected": -122.6275863647461, + "loss": 0.0044, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.18404527008533478, + "rewards/margins": 27.751272201538086, + "rewards/rejected": -27.56722640991211, + "step": 5250 + }, + { + "epoch": 2.4, + "learning_rate": 6.659340659340658e-08, + "logits/chosen": -1.3793939352035522, + "logits/rejected": -1.3092762231826782, + "logps/chosen": -90.41889953613281, + "logps/rejected": -118.99210357666016, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34509414434432983, + "rewards/margins": 27.324649810791016, + "rewards/rejected": -26.97955322265625, + "step": 5260 + }, + { + "epoch": 2.41, + "learning_rate": 6.608622147083685e-08, + "logits/chosen": -1.3813196420669556, + "logits/rejected": -1.3037269115447998, + "logps/chosen": -93.68791198730469, + "logps/rejected": -128.4936981201172, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1084160804748535, + "rewards/margins": 28.909637451171875, + "rewards/rejected": -27.801223754882812, + "step": 5270 + }, + { + "epoch": 2.41, + "learning_rate": 6.557903634826712e-08, + "logits/chosen": -1.3911049365997314, + "logits/rejected": -1.3268201351165771, + "logps/chosen": -87.09625244140625, + "logps/rejected": -122.59407806396484, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46324244141578674, + "rewards/margins": 27.979909896850586, + "rewards/rejected": -27.51666831970215, + "step": 5280 + }, + { + "epoch": 2.41, + "learning_rate": 6.507185122569737e-08, + "logits/chosen": -1.3915858268737793, + 
"logits/rejected": -1.3138229846954346, + "logps/chosen": -93.52770233154297, + "logps/rejected": -121.19853210449219, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6541392803192139, + "rewards/margins": 29.604787826538086, + "rewards/rejected": -27.95064926147461, + "step": 5290 + }, + { + "epoch": 2.42, + "learning_rate": 6.456466610312764e-08, + "logits/chosen": -1.4070789813995361, + "logits/rejected": -1.3349624872207642, + "logps/chosen": -91.31390380859375, + "logps/rejected": -119.59687805175781, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.320601224899292, + "rewards/margins": 28.13943862915039, + "rewards/rejected": -26.818838119506836, + "step": 5300 + }, + { + "epoch": 2.42, + "eval_logits/chosen": -1.501613736152649, + "eval_logits/rejected": -1.4148671627044678, + "eval_logps/chosen": -88.82596588134766, + "eval_logps/rejected": -119.5730209350586, + "eval_loss": 0.007715919055044651, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.30323532223701477, + "eval_rewards/margins": 27.418886184692383, + "eval_rewards/rejected": -27.115652084350586, + "eval_runtime": 60.0405, + "eval_samples_per_second": 47.668, + "eval_steps_per_second": 2.981, + "step": 5300 + }, + { + "epoch": 2.42, + "learning_rate": 6.40574809805579e-08, + "logits/chosen": -1.388594150543213, + "logits/rejected": -1.314540147781372, + "logps/chosen": -90.13387298583984, + "logps/rejected": -122.98262023925781, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6017175316810608, + "rewards/margins": 28.00093650817871, + "rewards/rejected": -27.399221420288086, + "step": 5310 + }, + { + "epoch": 2.43, + "learning_rate": 6.355029585798816e-08, + "logits/chosen": -1.4170560836791992, + "logits/rejected": -1.3456897735595703, + "logps/chosen": -88.29602813720703, + "logps/rejected": -122.81431579589844, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3829694986343384, + 
"rewards/margins": 29.370956420898438, + "rewards/rejected": -27.987987518310547, + "step": 5320 + }, + { + "epoch": 2.43, + "learning_rate": 6.304311073541843e-08, + "logits/chosen": -1.3887192010879517, + "logits/rejected": -1.3243298530578613, + "logps/chosen": -88.75332641601562, + "logps/rejected": -120.09466552734375, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2215611934661865, + "rewards/margins": 26.91391944885254, + "rewards/rejected": -25.692358016967773, + "step": 5330 + }, + { + "epoch": 2.44, + "learning_rate": 6.253592561284868e-08, + "logits/chosen": -1.4018067121505737, + "logits/rejected": -1.3270535469055176, + "logps/chosen": -93.98942565917969, + "logps/rejected": -125.354248046875, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2943921089172363, + "rewards/margins": 28.14287757873535, + "rewards/rejected": -26.84848403930664, + "step": 5340 + }, + { + "epoch": 2.44, + "learning_rate": 6.202874049027895e-08, + "logits/chosen": -1.3864609003067017, + "logits/rejected": -1.3177391290664673, + "logps/chosen": -91.09075927734375, + "logps/rejected": -124.11067962646484, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8371120691299438, + "rewards/margins": 28.26397705078125, + "rewards/rejected": -27.426868438720703, + "step": 5350 + }, + { + "epoch": 2.45, + "learning_rate": 6.15215553677092e-08, + "logits/chosen": -1.4059492349624634, + "logits/rejected": -1.3294079303741455, + "logps/chosen": -93.74061584472656, + "logps/rejected": -119.74947357177734, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4449167251586914, + "rewards/margins": 27.587207794189453, + "rewards/rejected": -26.142292022705078, + "step": 5360 + }, + { + "epoch": 2.45, + "learning_rate": 6.101437024513947e-08, + "logits/chosen": -1.3770349025726318, + "logits/rejected": -1.3040486574172974, + "logps/chosen": -92.32209014892578, + "logps/rejected": -123.76420593261719, + 
"loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0093997716903687, + "rewards/margins": 27.536449432373047, + "rewards/rejected": -26.527050018310547, + "step": 5370 + }, + { + "epoch": 2.46, + "learning_rate": 6.050718512256973e-08, + "logits/chosen": -1.3874174356460571, + "logits/rejected": -1.3079521656036377, + "logps/chosen": -89.07852172851562, + "logps/rejected": -120.56114196777344, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3143458366394043, + "rewards/margins": 27.301443099975586, + "rewards/rejected": -25.987096786499023, + "step": 5380 + }, + { + "epoch": 2.46, + "learning_rate": 6e-08, + "logits/chosen": -1.4424034357070923, + "logits/rejected": -1.3676373958587646, + "logps/chosen": -93.13945770263672, + "logps/rejected": -124.12135314941406, + "loss": 0.006, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4384705424308777, + "rewards/margins": 27.193965911865234, + "rewards/rejected": -26.755496978759766, + "step": 5390 + }, + { + "epoch": 2.46, + "learning_rate": 5.949281487743026e-08, + "logits/chosen": -1.4014787673950195, + "logits/rejected": -1.3362300395965576, + "logps/chosen": -88.39823150634766, + "logps/rejected": -119.18223571777344, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5621711611747742, + "rewards/margins": 27.4252872467041, + "rewards/rejected": -26.863113403320312, + "step": 5400 + }, + { + "epoch": 2.46, + "eval_logits/chosen": -1.5089606046676636, + "eval_logits/rejected": -1.4168381690979004, + "eval_logps/chosen": -88.5813217163086, + "eval_logps/rejected": -119.76658630371094, + "eval_loss": 0.00763333635404706, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.42555317282676697, + "eval_rewards/margins": 27.637985229492188, + "eval_rewards/rejected": -27.212432861328125, + "eval_runtime": 62.6634, + "eval_samples_per_second": 45.673, + "eval_steps_per_second": 2.857, + "step": 5400 + }, + { + "epoch": 2.47, + 
"learning_rate": 5.898562975486052e-08, + "logits/chosen": -1.4362767934799194, + "logits/rejected": -1.3550790548324585, + "logps/chosen": -89.90331268310547, + "logps/rejected": -124.781982421875, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.522152304649353, + "rewards/margins": 29.26023292541504, + "rewards/rejected": -27.738079071044922, + "step": 5410 + }, + { + "epoch": 2.47, + "learning_rate": 5.8478444632290784e-08, + "logits/chosen": -1.4165668487548828, + "logits/rejected": -1.343462586402893, + "logps/chosen": -90.41630554199219, + "logps/rejected": -127.66859436035156, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15914836525917053, + "rewards/margins": 29.304555892944336, + "rewards/rejected": -29.145410537719727, + "step": 5420 + }, + { + "epoch": 2.48, + "learning_rate": 5.7971259509721045e-08, + "logits/chosen": -1.3894436359405518, + "logits/rejected": -1.317000150680542, + "logps/chosen": -88.46882629394531, + "logps/rejected": -120.24174499511719, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4189438819885254, + "rewards/margins": 27.224233627319336, + "rewards/rejected": -26.805288314819336, + "step": 5430 + }, + { + "epoch": 2.48, + "learning_rate": 5.7464074387151306e-08, + "logits/chosen": -1.4001535177230835, + "logits/rejected": -1.3215277194976807, + "logps/chosen": -87.59601593017578, + "logps/rejected": -119.82057189941406, + "loss": 0.0011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6788640022277832, + "rewards/margins": 27.242944717407227, + "rewards/rejected": -26.5640811920166, + "step": 5440 + }, + { + "epoch": 2.49, + "learning_rate": 5.695688926458157e-08, + "logits/chosen": -1.4465348720550537, + "logits/rejected": -1.3477718830108643, + "logps/chosen": -89.69527435302734, + "logps/rejected": -118.54779052734375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.376774549484253, + "rewards/margins": 28.213756561279297, + 
"rewards/rejected": -25.836984634399414, + "step": 5450 + }, + { + "epoch": 2.49, + "learning_rate": 5.6449704142011834e-08, + "logits/chosen": -1.440271019935608, + "logits/rejected": -1.3544074296951294, + "logps/chosen": -85.5431900024414, + "logps/rejected": -122.612060546875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.067859172821045, + "rewards/margins": 29.92409896850586, + "rewards/rejected": -28.85623550415039, + "step": 5460 + }, + { + "epoch": 2.5, + "learning_rate": 5.5942519019442095e-08, + "logits/chosen": -1.39127779006958, + "logits/rejected": -1.325758695602417, + "logps/chosen": -83.56703186035156, + "logps/rejected": -124.7584457397461, + "loss": 0.0022, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6307471990585327, + "rewards/margins": 28.204599380493164, + "rewards/rejected": -27.5738468170166, + "step": 5470 + }, + { + "epoch": 2.5, + "learning_rate": 5.5435333896872356e-08, + "logits/chosen": -1.4168808460235596, + "logits/rejected": -1.3478556871414185, + "logps/chosen": -89.90568542480469, + "logps/rejected": -121.96623229980469, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0229051113128662, + "rewards/margins": 28.127361297607422, + "rewards/rejected": -27.10445785522461, + "step": 5480 + }, + { + "epoch": 2.51, + "learning_rate": 5.492814877430262e-08, + "logits/chosen": -1.432664394378662, + "logits/rejected": -1.359386920928955, + "logps/chosen": -86.13729858398438, + "logps/rejected": -120.99588775634766, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0249274969100952, + "rewards/margins": 28.6977481842041, + "rewards/rejected": -27.67281723022461, + "step": 5490 + }, + { + "epoch": 2.51, + "learning_rate": 5.442096365173288e-08, + "logits/chosen": -1.4274625778198242, + "logits/rejected": -1.3294579982757568, + "logps/chosen": -94.29920959472656, + "logps/rejected": -124.21602630615234, + "loss": 0.0, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 0.37506765127182007, + "rewards/margins": 28.83444595336914, + "rewards/rejected": -28.45937728881836, + "step": 5500 + }, + { + "epoch": 2.51, + "eval_logits/chosen": -1.530104637145996, + "eval_logits/rejected": -1.4378955364227295, + "eval_logps/chosen": -88.8484115600586, + "eval_logps/rejected": -120.21839141845703, + "eval_loss": 0.007588541135191917, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.2920094132423401, + "eval_rewards/margins": 27.730342864990234, + "eval_rewards/rejected": -27.438335418701172, + "eval_runtime": 63.525, + "eval_samples_per_second": 45.053, + "eval_steps_per_second": 2.818, + "step": 5500 + }, + { + "epoch": 2.51, + "learning_rate": 5.3913778529163145e-08, + "logits/chosen": -1.4144002199172974, + "logits/rejected": -1.337476372718811, + "logps/chosen": -86.93864440917969, + "logps/rejected": -125.42852783203125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5241835117340088, + "rewards/margins": 29.53707504272461, + "rewards/rejected": -29.012889862060547, + "step": 5510 + }, + { + "epoch": 2.52, + "learning_rate": 5.3406593406593406e-08, + "logits/chosen": -1.3884155750274658, + "logits/rejected": -1.324973464012146, + "logps/chosen": -83.95980834960938, + "logps/rejected": -120.20356750488281, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15396860241889954, + "rewards/margins": 26.457748413085938, + "rewards/rejected": -26.61171531677246, + "step": 5520 + }, + { + "epoch": 2.52, + "learning_rate": 5.289940828402367e-08, + "logits/chosen": -1.4237573146820068, + "logits/rejected": -1.3433820009231567, + "logps/chosen": -94.42403411865234, + "logps/rejected": -125.9416732788086, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.3251895606517792, + "rewards/margins": 28.843847274780273, + "rewards/rejected": -28.51865577697754, + "step": 5530 + }, + { + "epoch": 2.53, + "learning_rate": 
5.239222316145393e-08, + "logits/chosen": -1.4355299472808838, + "logits/rejected": -1.363010048866272, + "logps/chosen": -88.18751525878906, + "logps/rejected": -125.2325668334961, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37160930037498474, + "rewards/margins": 28.638696670532227, + "rewards/rejected": -28.26708984375, + "step": 5540 + }, + { + "epoch": 2.53, + "learning_rate": 5.188503803888419e-08, + "logits/chosen": -1.4135535955429077, + "logits/rejected": -1.331364631652832, + "logps/chosen": -94.18409729003906, + "logps/rejected": -123.58796691894531, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.10003487765789032, + "rewards/margins": 28.183786392211914, + "rewards/rejected": -28.083749771118164, + "step": 5550 + }, + { + "epoch": 2.54, + "learning_rate": 5.137785291631445e-08, + "logits/chosen": -1.4515702724456787, + "logits/rejected": -1.3703655004501343, + "logps/chosen": -86.55001068115234, + "logps/rejected": -129.2012481689453, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22907297313213348, + "rewards/margins": 29.383153915405273, + "rewards/rejected": -29.15407943725586, + "step": 5560 + }, + { + "epoch": 2.54, + "learning_rate": 5.087066779374472e-08, + "logits/chosen": -1.3880281448364258, + "logits/rejected": -1.3039556741714478, + "logps/chosen": -93.82992553710938, + "logps/rejected": -118.81803131103516, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1777141094207764, + "rewards/margins": 27.936527252197266, + "rewards/rejected": -26.758813858032227, + "step": 5570 + }, + { + "epoch": 2.55, + "learning_rate": 5.036348267117498e-08, + "logits/chosen": -1.4143790006637573, + "logits/rejected": -1.3327438831329346, + "logps/chosen": -90.89906311035156, + "logps/rejected": -125.50030517578125, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.46656179428100586, + "rewards/margins": 29.756689071655273, + 
"rewards/rejected": -29.290124893188477, + "step": 5580 + }, + { + "epoch": 2.55, + "learning_rate": 4.985629754860524e-08, + "logits/chosen": -1.3946322202682495, + "logits/rejected": -1.3158382177352905, + "logps/chosen": -93.13166809082031, + "logps/rejected": -120.45831298828125, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6907525062561035, + "rewards/margins": 27.038888931274414, + "rewards/rejected": -27.72964096069336, + "step": 5590 + }, + { + "epoch": 2.56, + "learning_rate": 4.93491124260355e-08, + "logits/chosen": -1.444240927696228, + "logits/rejected": -1.3477418422698975, + "logps/chosen": -92.03813171386719, + "logps/rejected": -123.6197280883789, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03295283392071724, + "rewards/margins": 28.512805938720703, + "rewards/rejected": -28.479854583740234, + "step": 5600 + }, + { + "epoch": 2.56, + "eval_logits/chosen": -1.5288245677947998, + "eval_logits/rejected": -1.4369823932647705, + "eval_logps/chosen": -89.01703643798828, + "eval_logps/rejected": -120.62100982666016, + "eval_loss": 0.007766298484057188, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.2076922357082367, + "eval_rewards/margins": 27.84734535217285, + "eval_rewards/rejected": -27.639650344848633, + "eval_runtime": 65.4543, + "eval_samples_per_second": 43.725, + "eval_steps_per_second": 2.735, + "step": 5600 + }, + { + "epoch": 2.56, + "learning_rate": 4.884192730346576e-08, + "logits/chosen": -1.4230222702026367, + "logits/rejected": -1.3513391017913818, + "logps/chosen": -90.27880096435547, + "logps/rejected": -119.94172668457031, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44236135482788086, + "rewards/margins": 28.575183868408203, + "rewards/rejected": -28.132823944091797, + "step": 5610 + }, + { + "epoch": 2.57, + "learning_rate": 4.833474218089602e-08, + "logits/chosen": -1.4552013874053955, + "logits/rejected": -1.3836065530776978, + 
"logps/chosen": -87.5635986328125, + "logps/rejected": -122.162353515625, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6596953868865967, + "rewards/margins": 26.969858169555664, + "rewards/rejected": -28.629552841186523, + "step": 5620 + }, + { + "epoch": 2.57, + "learning_rate": 4.782755705832629e-08, + "logits/chosen": -1.4242037534713745, + "logits/rejected": -1.3452680110931396, + "logps/chosen": -88.09815979003906, + "logps/rejected": -124.96726989746094, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3257520198822021, + "rewards/margins": 29.73336410522461, + "rewards/rejected": -28.407611846923828, + "step": 5630 + }, + { + "epoch": 2.57, + "learning_rate": 4.732037193575655e-08, + "logits/chosen": -1.4236841201782227, + "logits/rejected": -1.3491275310516357, + "logps/chosen": -91.39635467529297, + "logps/rejected": -121.4232406616211, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.2293356955051422, + "rewards/margins": 27.425186157226562, + "rewards/rejected": -27.195850372314453, + "step": 5640 + }, + { + "epoch": 2.58, + "learning_rate": 4.681318681318681e-08, + "logits/chosen": -1.4207613468170166, + "logits/rejected": -1.3308521509170532, + "logps/chosen": -91.08123779296875, + "logps/rejected": -118.93770599365234, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.274003028869629, + "rewards/margins": 28.229358673095703, + "rewards/rejected": -26.955352783203125, + "step": 5650 + }, + { + "epoch": 2.58, + "learning_rate": 4.630600169061707e-08, + "logits/chosen": -1.4237556457519531, + "logits/rejected": -1.3543760776519775, + "logps/chosen": -85.66486358642578, + "logps/rejected": -124.79557037353516, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14971943199634552, + "rewards/margins": 28.228811264038086, + "rewards/rejected": -28.079092025756836, + "step": 5660 + }, + { + "epoch": 2.59, + "learning_rate": 
4.579881656804733e-08, + "logits/chosen": -1.452664852142334, + "logits/rejected": -1.3631222248077393, + "logps/chosen": -93.4820556640625, + "logps/rejected": -123.2723617553711, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7186456918716431, + "rewards/margins": 28.007343292236328, + "rewards/rejected": -28.72598648071289, + "step": 5670 + }, + { + "epoch": 2.59, + "learning_rate": 4.5291631445477594e-08, + "logits/chosen": -1.4738563299179077, + "logits/rejected": -1.396863341331482, + "logps/chosen": -85.08771514892578, + "logps/rejected": -125.86419677734375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7788127660751343, + "rewards/margins": 29.58379554748535, + "rewards/rejected": -28.804983139038086, + "step": 5680 + }, + { + "epoch": 2.6, + "learning_rate": 4.478444632290786e-08, + "logits/chosen": -1.4112271070480347, + "logits/rejected": -1.342129111289978, + "logps/chosen": -93.12382507324219, + "logps/rejected": -132.10922241210938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42668676376342773, + "rewards/margins": 29.515426635742188, + "rewards/rejected": -29.0887393951416, + "step": 5690 + }, + { + "epoch": 2.6, + "learning_rate": 4.427726120033812e-08, + "logits/chosen": -1.4182324409484863, + "logits/rejected": -1.3355042934417725, + "logps/chosen": -95.10716247558594, + "logps/rejected": -121.40716552734375, + "loss": 0.0012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.24203085899353027, + "rewards/margins": 27.118215560913086, + "rewards/rejected": -27.360248565673828, + "step": 5700 + }, + { + "epoch": 2.6, + "eval_logits/chosen": -1.521308422088623, + "eval_logits/rejected": -1.4287875890731812, + "eval_logps/chosen": -88.81271362304688, + "eval_logps/rejected": -120.16915893554688, + "eval_loss": 0.007557415869086981, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.3098601996898651, + "eval_rewards/margins": 
27.723575592041016, + "eval_rewards/rejected": -27.413715362548828, + "eval_runtime": 64.7947, + "eval_samples_per_second": 44.17, + "eval_steps_per_second": 2.763, + "step": 5700 + }, + { + "epoch": 2.61, + "learning_rate": 4.377007607776838e-08, + "logits/chosen": -1.4278671741485596, + "logits/rejected": -1.3431296348571777, + "logps/chosen": -88.24993133544922, + "logps/rejected": -121.37858581542969, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6779876947402954, + "rewards/margins": 29.29281997680664, + "rewards/rejected": -27.614831924438477, + "step": 5710 + }, + { + "epoch": 2.61, + "learning_rate": 4.3262890955198644e-08, + "logits/chosen": -1.3866733312606812, + "logits/rejected": -1.3263394832611084, + "logps/chosen": -89.70895385742188, + "logps/rejected": -118.44575500488281, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36240631341934204, + "rewards/margins": 26.25311851501465, + "rewards/rejected": -25.89071273803711, + "step": 5720 + }, + { + "epoch": 2.62, + "learning_rate": 4.2755705832628905e-08, + "logits/chosen": -1.4577974081039429, + "logits/rejected": -1.3561092615127563, + "logps/chosen": -90.57540893554688, + "logps/rejected": -122.50830078125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9288503527641296, + "rewards/margins": 29.591562271118164, + "rewards/rejected": -28.662710189819336, + "step": 5730 + }, + { + "epoch": 2.62, + "learning_rate": 4.2248520710059166e-08, + "logits/chosen": -1.430964708328247, + "logits/rejected": -1.362330675125122, + "logps/chosen": -93.49796295166016, + "logps/rejected": -127.06739807128906, + "loss": 0.0033, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5318681001663208, + "rewards/margins": 28.357601165771484, + "rewards/rejected": -27.82573890686035, + "step": 5740 + }, + { + "epoch": 2.62, + "learning_rate": 4.1741335587489433e-08, + "logits/chosen": -1.4357731342315674, + "logits/rejected": -1.3574306964874268, + 
"logps/chosen": -88.33089447021484, + "logps/rejected": -124.7070083618164, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2313622683286667, + "rewards/margins": 29.021398544311523, + "rewards/rejected": -28.790035247802734, + "step": 5750 + }, + { + "epoch": 2.63, + "learning_rate": 4.1234150464919694e-08, + "logits/chosen": -1.4213526248931885, + "logits/rejected": -1.364757776260376, + "logps/chosen": -81.57777404785156, + "logps/rejected": -119.826171875, + "loss": 0.0034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9536989331245422, + "rewards/margins": 27.143407821655273, + "rewards/rejected": -26.189708709716797, + "step": 5760 + }, + { + "epoch": 2.63, + "learning_rate": 4.0726965342349955e-08, + "logits/chosen": -1.4012161493301392, + "logits/rejected": -1.3225289583206177, + "logps/chosen": -91.90386199951172, + "logps/rejected": -119.8282241821289, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7034690380096436, + "rewards/margins": 28.835796356201172, + "rewards/rejected": -27.132328033447266, + "step": 5770 + }, + { + "epoch": 2.64, + "learning_rate": 4.0219780219780216e-08, + "logits/chosen": -1.4080169200897217, + "logits/rejected": -1.338749647140503, + "logps/chosen": -94.41007995605469, + "logps/rejected": -125.92796325683594, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.140974998474121, + "rewards/margins": 26.483749389648438, + "rewards/rejected": -25.342771530151367, + "step": 5780 + }, + { + "epoch": 2.64, + "learning_rate": 3.971259509721048e-08, + "logits/chosen": -1.4085487127304077, + "logits/rejected": -1.327182650566101, + "logps/chosen": -88.75342559814453, + "logps/rejected": -119.1405029296875, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0149381160736084, + "rewards/margins": 27.902973175048828, + "rewards/rejected": -26.888031005859375, + "step": 5790 + }, + { + "epoch": 2.65, + "learning_rate": 3.920540997464074e-08, + 
"logits/chosen": -1.433281421661377, + "logits/rejected": -1.3512113094329834, + "logps/chosen": -90.19625854492188, + "logps/rejected": -126.81318664550781, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6478980779647827, + "rewards/margins": 29.77631187438965, + "rewards/rejected": -29.12841796875, + "step": 5800 + }, + { + "epoch": 2.65, + "eval_logits/chosen": -1.5241363048553467, + "eval_logits/rejected": -1.4326359033584595, + "eval_logps/chosen": -88.73265838623047, + "eval_logps/rejected": -120.24764251708984, + "eval_loss": 0.007415792904794216, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.3498854637145996, + "eval_rewards/margins": 27.802854537963867, + "eval_rewards/rejected": -27.452966690063477, + "eval_runtime": 66.3571, + "eval_samples_per_second": 43.13, + "eval_steps_per_second": 2.698, + "step": 5800 + }, + { + "epoch": 2.65, + "learning_rate": 3.8698224852071005e-08, + "logits/chosen": -1.3824363946914673, + "logits/rejected": -1.3102766275405884, + "logps/chosen": -90.2943115234375, + "logps/rejected": -120.10107421875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4802871346473694, + "rewards/margins": 27.660263061523438, + "rewards/rejected": -27.179973602294922, + "step": 5810 + }, + { + "epoch": 2.66, + "learning_rate": 3.8191039729501266e-08, + "logits/chosen": -1.405815601348877, + "logits/rejected": -1.3217995166778564, + "logps/chosen": -83.80717468261719, + "logps/rejected": -120.4073715209961, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3315509557724, + "rewards/margins": 28.07989501953125, + "rewards/rejected": -26.748342514038086, + "step": 5820 + }, + { + "epoch": 2.66, + "learning_rate": 3.768385460693153e-08, + "logits/chosen": -1.400803565979004, + "logits/rejected": -1.3341869115829468, + "logps/chosen": -85.88481140136719, + "logps/rejected": -122.70805358886719, + "loss": 0.0033, + "rewards/accuracies": 
0.987500011920929, + "rewards/chosen": -0.08723556995391846, + "rewards/margins": 27.86008071899414, + "rewards/rejected": -27.947315216064453, + "step": 5830 + }, + { + "epoch": 2.67, + "learning_rate": 3.717666948436179e-08, + "logits/chosen": -1.4672653675079346, + "logits/rejected": -1.3654979467391968, + "logps/chosen": -88.60813903808594, + "logps/rejected": -124.53550720214844, + "loss": 0.0013, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0332581996917725, + "rewards/margins": 29.575796127319336, + "rewards/rejected": -28.54253578186035, + "step": 5840 + }, + { + "epoch": 2.67, + "learning_rate": 3.6669484361792056e-08, + "logits/chosen": -1.4773852825164795, + "logits/rejected": -1.3942975997924805, + "logps/chosen": -84.61897277832031, + "logps/rejected": -118.28257751464844, + "loss": 0.0059, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.075499415397644, + "rewards/margins": 27.52908706665039, + "rewards/rejected": -26.45358657836914, + "step": 5850 + }, + { + "epoch": 2.67, + "learning_rate": 3.6162299239222316e-08, + "logits/chosen": -1.4367072582244873, + "logits/rejected": -1.3531558513641357, + "logps/chosen": -92.48080444335938, + "logps/rejected": -124.60569763183594, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9651349782943726, + "rewards/margins": 30.93218421936035, + "rewards/rejected": -28.9670467376709, + "step": 5860 + }, + { + "epoch": 2.68, + "learning_rate": 3.565511411665258e-08, + "logits/chosen": -1.4036922454833984, + "logits/rejected": -1.3271372318267822, + "logps/chosen": -91.8263168334961, + "logps/rejected": -123.1822509765625, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49861449003219604, + "rewards/margins": 27.064804077148438, + "rewards/rejected": -26.56618881225586, + "step": 5870 + }, + { + "epoch": 2.68, + "learning_rate": 3.514792899408284e-08, + "logits/chosen": -1.3846232891082764, + "logits/rejected": 
-1.3122532367706299, + "logps/chosen": -84.51948547363281, + "logps/rejected": -121.88045501708984, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.880649209022522, + "rewards/margins": 28.404842376708984, + "rewards/rejected": -27.524194717407227, + "step": 5880 + }, + { + "epoch": 2.69, + "learning_rate": 3.46407438715131e-08, + "logits/chosen": -1.4396214485168457, + "logits/rejected": -1.3636229038238525, + "logps/chosen": -93.21275329589844, + "logps/rejected": -124.11541748046875, + "loss": 0.0087, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.49375027418136597, + "rewards/margins": 28.062625885009766, + "rewards/rejected": -27.568878173828125, + "step": 5890 + }, + { + "epoch": 2.69, + "learning_rate": 3.413355874894336e-08, + "logits/chosen": -1.4009182453155518, + "logits/rejected": -1.3155372142791748, + "logps/chosen": -90.7492446899414, + "logps/rejected": -121.24101257324219, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.570563018321991, + "rewards/margins": 28.15814781188965, + "rewards/rejected": -27.587589263916016, + "step": 5900 + }, + { + "epoch": 2.69, + "eval_logits/chosen": -1.5298057794570923, + "eval_logits/rejected": -1.4371873140335083, + "eval_logps/chosen": -88.87010955810547, + "eval_logps/rejected": -120.50233459472656, + "eval_loss": 0.007439317647367716, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.28115907311439514, + "eval_rewards/margins": 27.861465454101562, + "eval_rewards/rejected": -27.580307006835938, + "eval_runtime": 61.2948, + "eval_samples_per_second": 46.692, + "eval_steps_per_second": 2.92, + "step": 5900 + }, + { + "epoch": 2.7, + "learning_rate": 3.362637362637363e-08, + "logits/chosen": -1.386823058128357, + "logits/rejected": -1.317825198173523, + "logps/chosen": -88.96937561035156, + "logps/rejected": -118.41639709472656, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38685184717178345, + 
"rewards/margins": 26.644840240478516, + "rewards/rejected": -26.257986068725586, + "step": 5910 + }, + { + "epoch": 2.7, + "learning_rate": 3.311918850380389e-08, + "logits/chosen": -1.4166361093521118, + "logits/rejected": -1.3327529430389404, + "logps/chosen": -91.54283905029297, + "logps/rejected": -125.4769515991211, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6538678407669067, + "rewards/margins": 29.736963272094727, + "rewards/rejected": -29.083093643188477, + "step": 5920 + }, + { + "epoch": 2.71, + "learning_rate": 3.261200338123415e-08, + "logits/chosen": -1.4290189743041992, + "logits/rejected": -1.3503470420837402, + "logps/chosen": -93.33061218261719, + "logps/rejected": -120.43190002441406, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5092540979385376, + "rewards/margins": 27.29727554321289, + "rewards/rejected": -25.788021087646484, + "step": 5930 + }, + { + "epoch": 2.71, + "learning_rate": 3.210481825866441e-08, + "logits/chosen": -1.4096345901489258, + "logits/rejected": -1.3365631103515625, + "logps/chosen": -92.1152114868164, + "logps/rejected": -129.61373901367188, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6155846118927002, + "rewards/margins": 27.218673706054688, + "rewards/rejected": -27.83425521850586, + "step": 5940 + }, + { + "epoch": 2.72, + "learning_rate": 3.159763313609467e-08, + "logits/chosen": -1.4408024549484253, + "logits/rejected": -1.358703851699829, + "logps/chosen": -92.8883056640625, + "logps/rejected": -127.37141418457031, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4508030414581299, + "rewards/margins": 29.662633895874023, + "rewards/rejected": -28.211828231811523, + "step": 5950 + }, + { + "epoch": 2.72, + "learning_rate": 3.109044801352493e-08, + "logits/chosen": -1.4268407821655273, + "logits/rejected": -1.3649609088897705, + "logps/chosen": -91.47252655029297, + "logps/rejected": -127.54573059082031, + "loss": 0.0011, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 0.9097731709480286, + "rewards/margins": 28.423721313476562, + "rewards/rejected": -27.51395034790039, + "step": 5960 + }, + { + "epoch": 2.72, + "learning_rate": 3.05832628909552e-08, + "logits/chosen": -1.427631139755249, + "logits/rejected": -1.349169373512268, + "logps/chosen": -88.80760192871094, + "logps/rejected": -119.60186767578125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3762190341949463, + "rewards/margins": 28.201892852783203, + "rewards/rejected": -26.825674057006836, + "step": 5970 + }, + { + "epoch": 2.73, + "learning_rate": 3.007607776838546e-08, + "logits/chosen": -1.433571219444275, + "logits/rejected": -1.3482530117034912, + "logps/chosen": -92.32679748535156, + "logps/rejected": -124.57658386230469, + "loss": 0.0011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.08425948768854141, + "rewards/margins": 28.501611709594727, + "rewards/rejected": -28.4173526763916, + "step": 5980 + }, + { + "epoch": 2.73, + "learning_rate": 2.956889264581572e-08, + "logits/chosen": -1.4247334003448486, + "logits/rejected": -1.3412024974822998, + "logps/chosen": -97.79167938232422, + "logps/rejected": -129.82369995117188, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.341404676437378, + "rewards/margins": 30.7274169921875, + "rewards/rejected": -29.38601303100586, + "step": 5990 + }, + { + "epoch": 2.74, + "learning_rate": 2.9061707523245986e-08, + "logits/chosen": -1.4527391195297241, + "logits/rejected": -1.3852466344833374, + "logps/chosen": -84.54109191894531, + "logps/rejected": -123.23868560791016, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4699872136116028, + "rewards/margins": 28.9207706451416, + "rewards/rejected": -28.450780868530273, + "step": 6000 + }, + { + "epoch": 2.74, + "eval_logits/chosen": -1.5361018180847168, + "eval_logits/rejected": -1.4470702409744263, + "eval_logps/chosen": -88.81427764892578, + 
"eval_logps/rejected": -120.10942077636719, + "eval_loss": 0.0073595428839325905, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": 0.30907416343688965, + "eval_rewards/margins": 27.692928314208984, + "eval_rewards/rejected": -27.383853912353516, + "eval_runtime": 66.1237, + "eval_samples_per_second": 43.283, + "eval_steps_per_second": 2.707, + "step": 6000 + }, + { + "epoch": 2.74, + "learning_rate": 2.8554522400676247e-08, + "logits/chosen": -1.437772512435913, + "logits/rejected": -1.3702958822250366, + "logps/chosen": -88.56996154785156, + "logps/rejected": -120.36534118652344, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8493738174438477, + "rewards/margins": 25.939929962158203, + "rewards/rejected": -26.789306640625, + "step": 6010 + }, + { + "epoch": 2.75, + "learning_rate": 2.8047337278106507e-08, + "logits/chosen": -1.3986080884933472, + "logits/rejected": -1.3344132900238037, + "logps/chosen": -87.32596588134766, + "logps/rejected": -124.3438949584961, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7483171820640564, + "rewards/margins": 28.974384307861328, + "rewards/rejected": -28.2260684967041, + "step": 6020 + }, + { + "epoch": 2.75, + "learning_rate": 2.754015215553677e-08, + "logits/chosen": -1.4268220663070679, + "logits/rejected": -1.3452110290527344, + "logps/chosen": -91.21931457519531, + "logps/rejected": -122.6042709350586, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31938114762306213, + "rewards/margins": 27.742395401000977, + "rewards/rejected": -27.423009872436523, + "step": 6030 + }, + { + "epoch": 2.76, + "learning_rate": 2.7032967032967033e-08, + "logits/chosen": -1.4078203439712524, + "logits/rejected": -1.341101884841919, + "logps/chosen": -88.9620132446289, + "logps/rejected": -126.86651611328125, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.03730924054980278, + "rewards/margins": 
28.27874755859375, + "rewards/rejected": -28.24143409729004, + "step": 6040 + }, + { + "epoch": 2.76, + "learning_rate": 2.6525781910397293e-08, + "logits/chosen": -1.4218459129333496, + "logits/rejected": -1.3417071104049683, + "logps/chosen": -94.39894104003906, + "logps/rejected": -126.17645263671875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41776663064956665, + "rewards/margins": 27.610363006591797, + "rewards/rejected": -27.192596435546875, + "step": 6050 + }, + { + "epoch": 2.77, + "learning_rate": 2.6018596787827558e-08, + "logits/chosen": -1.3829963207244873, + "logits/rejected": -1.3349157571792603, + "logps/chosen": -81.25392150878906, + "logps/rejected": -116.95680236816406, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4236263334751129, + "rewards/margins": 25.6158447265625, + "rewards/rejected": -25.192218780517578, + "step": 6060 + }, + { + "epoch": 2.77, + "learning_rate": 2.551141166525782e-08, + "logits/chosen": -1.4441430568695068, + "logits/rejected": -1.3542683124542236, + "logps/chosen": -94.20352172851562, + "logps/rejected": -124.4933090209961, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7039328813552856, + "rewards/margins": 27.073434829711914, + "rewards/rejected": -26.369503021240234, + "step": 6070 + }, + { + "epoch": 2.77, + "learning_rate": 2.500422654268808e-08, + "logits/chosen": -1.4258487224578857, + "logits/rejected": -1.357102632522583, + "logps/chosen": -86.0672607421875, + "logps/rejected": -125.6698226928711, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5518980026245117, + "rewards/margins": 29.156414031982422, + "rewards/rejected": -28.604516983032227, + "step": 6080 + }, + { + "epoch": 2.78, + "learning_rate": 2.4497041420118344e-08, + "logits/chosen": -1.44197678565979, + "logits/rejected": -1.3703984022140503, + "logps/chosen": -84.29279327392578, + "logps/rejected": -122.15272521972656, + "loss": 0.0022, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 1.0316746234893799, + "rewards/margins": 28.804452896118164, + "rewards/rejected": -27.772777557373047, + "step": 6090 + }, + { + "epoch": 2.78, + "learning_rate": 2.3989856297548605e-08, + "logits/chosen": -1.4140945672988892, + "logits/rejected": -1.3373467922210693, + "logps/chosen": -92.00602722167969, + "logps/rejected": -121.0610122680664, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4240033030509949, + "rewards/margins": 27.450969696044922, + "rewards/rejected": -27.026966094970703, + "step": 6100 + }, + { + "epoch": 2.78, + "eval_logits/chosen": -1.528950810432434, + "eval_logits/rejected": -1.4393686056137085, + "eval_logps/chosen": -88.54615783691406, + "eval_logps/rejected": -119.571533203125, + "eval_loss": 0.007185075432062149, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.4431401193141937, + "eval_rewards/margins": 27.55805015563965, + "eval_rewards/rejected": -27.114912033081055, + "eval_runtime": 67.6666, + "eval_samples_per_second": 42.296, + "eval_steps_per_second": 2.645, + "step": 6100 + }, + { + "epoch": 2.79, + "learning_rate": 2.3482671174978865e-08, + "logits/chosen": -1.3959143161773682, + "logits/rejected": -1.329087495803833, + "logps/chosen": -90.89431762695312, + "logps/rejected": -123.38334655761719, + "loss": 0.0033, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.0483791828155518, + "rewards/margins": 25.339763641357422, + "rewards/rejected": -26.388141632080078, + "step": 6110 + }, + { + "epoch": 2.79, + "learning_rate": 2.297548605240913e-08, + "logits/chosen": -1.4264185428619385, + "logits/rejected": -1.3347035646438599, + "logps/chosen": -95.05891418457031, + "logps/rejected": -122.20552825927734, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.40551090240478516, + "rewards/margins": 27.485652923583984, + "rewards/rejected": -27.080142974853516, + "step": 6120 + }, + { + 
"epoch": 2.8, + "learning_rate": 2.246830092983939e-08, + "logits/chosen": -1.4183063507080078, + "logits/rejected": -1.3475987911224365, + "logps/chosen": -90.90924835205078, + "logps/rejected": -123.7392349243164, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.11213073879480362, + "rewards/margins": 26.713592529296875, + "rewards/rejected": -26.82572364807129, + "step": 6130 + }, + { + "epoch": 2.8, + "learning_rate": 2.196111580726965e-08, + "logits/chosen": -1.3835358619689941, + "logits/rejected": -1.304900884628296, + "logps/chosen": -94.07560729980469, + "logps/rejected": -119.58497619628906, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7558839321136475, + "rewards/margins": 26.48470115661621, + "rewards/rejected": -25.72881507873535, + "step": 6140 + }, + { + "epoch": 2.81, + "learning_rate": 2.1453930684699916e-08, + "logits/chosen": -1.4276055097579956, + "logits/rejected": -1.356466293334961, + "logps/chosen": -92.86775207519531, + "logps/rejected": -124.24592590332031, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.07010960578918457, + "rewards/margins": 28.186676025390625, + "rewards/rejected": -28.25678062438965, + "step": 6150 + }, + { + "epoch": 2.81, + "learning_rate": 2.0946745562130177e-08, + "logits/chosen": -1.3897449970245361, + "logits/rejected": -1.3085477352142334, + "logps/chosen": -89.38758850097656, + "logps/rejected": -123.3631820678711, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.502859890460968, + "rewards/margins": 29.507598876953125, + "rewards/rejected": -29.00473976135254, + "step": 6160 + }, + { + "epoch": 2.82, + "learning_rate": 2.0439560439560437e-08, + "logits/chosen": -1.407470464706421, + "logits/rejected": -1.3406095504760742, + "logps/chosen": -93.42985534667969, + "logps/rejected": -124.22257995605469, + "loss": 0.0011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 
0.7225341200828552, + "rewards/margins": 28.852397918701172, + "rewards/rejected": -28.12986183166504, + "step": 6170 + }, + { + "epoch": 2.82, + "learning_rate": 1.99323753169907e-08, + "logits/chosen": -1.3827449083328247, + "logits/rejected": -1.3094285726547241, + "logps/chosen": -93.71525573730469, + "logps/rejected": -119.5619125366211, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6188391447067261, + "rewards/margins": 26.20868492126465, + "rewards/rejected": -25.58984375, + "step": 6180 + }, + { + "epoch": 2.83, + "learning_rate": 1.9425190194420963e-08, + "logits/chosen": -1.3776720762252808, + "logits/rejected": -1.3083680868148804, + "logps/chosen": -91.71514129638672, + "logps/rejected": -120.59306335449219, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6132274270057678, + "rewards/margins": 26.796030044555664, + "rewards/rejected": -27.409259796142578, + "step": 6190 + }, + { + "epoch": 2.83, + "learning_rate": 1.8918005071851223e-08, + "logits/chosen": -1.4177929162979126, + "logits/rejected": -1.3377645015716553, + "logps/chosen": -87.91746520996094, + "logps/rejected": -118.75276184082031, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8456829786300659, + "rewards/margins": 27.677001953125, + "rewards/rejected": -26.831317901611328, + "step": 6200 + }, + { + "epoch": 2.83, + "eval_logits/chosen": -1.5227910280227661, + "eval_logits/rejected": -1.4297844171524048, + "eval_logps/chosen": -88.48534393310547, + "eval_logps/rejected": -119.60254669189453, + "eval_loss": 0.00707243150100112, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.47354137897491455, + "eval_rewards/margins": 27.603960037231445, + "eval_rewards/rejected": -27.130416870117188, + "eval_runtime": 65.0475, + "eval_samples_per_second": 43.999, + "eval_steps_per_second": 2.752, + "step": 6200 + }, + { + "epoch": 2.83, + "learning_rate": 1.8410819949281488e-08, + "logits/chosen": 
-1.4211013317108154, + "logits/rejected": -1.336397409439087, + "logps/chosen": -89.54491424560547, + "logps/rejected": -122.1443862915039, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5423094034194946, + "rewards/margins": 28.126476287841797, + "rewards/rejected": -26.58416748046875, + "step": 6210 + }, + { + "epoch": 2.84, + "learning_rate": 1.790363482671175e-08, + "logits/chosen": -1.4224960803985596, + "logits/rejected": -1.3450286388397217, + "logps/chosen": -91.13011169433594, + "logps/rejected": -124.55561828613281, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6848041415214539, + "rewards/margins": 27.732412338256836, + "rewards/rejected": -27.047607421875, + "step": 6220 + }, + { + "epoch": 2.84, + "learning_rate": 1.7396449704142013e-08, + "logits/chosen": -1.4057667255401611, + "logits/rejected": -1.3270689249038696, + "logps/chosen": -94.62673950195312, + "logps/rejected": -121.52132415771484, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.3413497507572174, + "rewards/margins": 27.458669662475586, + "rewards/rejected": -27.117321014404297, + "step": 6230 + }, + { + "epoch": 2.85, + "learning_rate": 1.6889264581572274e-08, + "logits/chosen": -1.433934211730957, + "logits/rejected": -1.343518614768982, + "logps/chosen": -90.16001892089844, + "logps/rejected": -122.25347900390625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.359809547662735, + "rewards/margins": 28.131118774414062, + "rewards/rejected": -27.771310806274414, + "step": 6240 + }, + { + "epoch": 2.85, + "learning_rate": 1.6382079459002535e-08, + "logits/chosen": -1.4110838174819946, + "logits/rejected": -1.3541514873504639, + "logps/chosen": -91.34024810791016, + "logps/rejected": -128.32180786132812, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.36720380187034607, + "rewards/margins": 27.321441650390625, + "rewards/rejected": 
-27.688648223876953, + "step": 6250 + }, + { + "epoch": 2.86, + "learning_rate": 1.58748943364328e-08, + "logits/chosen": -1.389146089553833, + "logits/rejected": -1.3081094026565552, + "logps/chosen": -88.67632293701172, + "logps/rejected": -115.4472885131836, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8905091285705566, + "rewards/margins": 27.710601806640625, + "rewards/rejected": -26.82008934020996, + "step": 6260 + }, + { + "epoch": 2.86, + "learning_rate": 1.536770921386306e-08, + "logits/chosen": -1.3963115215301514, + "logits/rejected": -1.303027868270874, + "logps/chosen": -94.36498260498047, + "logps/rejected": -124.65643310546875, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32212838530540466, + "rewards/margins": 29.300546646118164, + "rewards/rejected": -28.97841453552246, + "step": 6270 + }, + { + "epoch": 2.87, + "learning_rate": 1.486052409129332e-08, + "logits/chosen": -1.4071121215820312, + "logits/rejected": -1.324487566947937, + "logps/chosen": -91.63459777832031, + "logps/rejected": -124.7200698852539, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18130116164684296, + "rewards/margins": 28.662633895874023, + "rewards/rejected": -28.481334686279297, + "step": 6280 + }, + { + "epoch": 2.87, + "learning_rate": 1.4353338968723583e-08, + "logits/chosen": -1.3938955068588257, + "logits/rejected": -1.3304332494735718, + "logps/chosen": -86.27269744873047, + "logps/rejected": -119.1937255859375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13392703235149384, + "rewards/margins": 26.717966079711914, + "rewards/rejected": -26.584041595458984, + "step": 6290 + }, + { + "epoch": 2.88, + "learning_rate": 1.3846153846153846e-08, + "logits/chosen": -1.4227640628814697, + "logits/rejected": -1.3536622524261475, + "logps/chosen": -89.34703063964844, + "logps/rejected": -123.28599548339844, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 
2.276293992996216, + "rewards/margins": 29.977630615234375, + "rewards/rejected": -27.701339721679688, + "step": 6300 + }, + { + "epoch": 2.88, + "eval_logits/chosen": -1.5174129009246826, + "eval_logits/rejected": -1.4249118566513062, + "eval_logps/chosen": -88.27791595458984, + "eval_logps/rejected": -119.14039611816406, + "eval_loss": 0.007073402404785156, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.5772566795349121, + "eval_rewards/margins": 27.476600646972656, + "eval_rewards/rejected": -26.89933967590332, + "eval_runtime": 65.0621, + "eval_samples_per_second": 43.989, + "eval_steps_per_second": 2.751, + "step": 6300 + }, + { + "epoch": 2.88, + "learning_rate": 1.3338968723584107e-08, + "logits/chosen": -1.4048776626586914, + "logits/rejected": -1.3209692239761353, + "logps/chosen": -94.9172592163086, + "logps/rejected": -121.06196594238281, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9303966760635376, + "rewards/margins": 27.744037628173828, + "rewards/rejected": -26.813640594482422, + "step": 6310 + }, + { + "epoch": 2.88, + "learning_rate": 1.2831783601014369e-08, + "logits/chosen": -1.4251675605773926, + "logits/rejected": -1.3501653671264648, + "logps/chosen": -87.30450439453125, + "logps/rejected": -124.9731674194336, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17765673995018005, + "rewards/margins": 27.48089599609375, + "rewards/rejected": -27.303241729736328, + "step": 6320 + }, + { + "epoch": 2.89, + "learning_rate": 1.2324598478444632e-08, + "logits/chosen": -1.3993405103683472, + "logits/rejected": -1.3213218450546265, + "logps/chosen": -89.23213958740234, + "logps/rejected": -119.50053405761719, + "loss": 0.0077, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9308843612670898, + "rewards/margins": 29.413715362548828, + "rewards/rejected": -28.482830047607422, + "step": 6330 + }, + { + "epoch": 2.89, + "learning_rate": 1.1817413355874893e-08, + 
"logits/chosen": -1.3939188718795776, + "logits/rejected": -1.326827883720398, + "logps/chosen": -90.73262786865234, + "logps/rejected": -124.04630279541016, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2764853835105896, + "rewards/margins": 27.666738510131836, + "rewards/rejected": -27.390254974365234, + "step": 6340 + }, + { + "epoch": 2.9, + "learning_rate": 1.1310228233305155e-08, + "logits/chosen": -1.3985464572906494, + "logits/rejected": -1.3156406879425049, + "logps/chosen": -86.709716796875, + "logps/rejected": -120.27693176269531, + "loss": 0.0025, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8232513666152954, + "rewards/margins": 27.919193267822266, + "rewards/rejected": -27.0959415435791, + "step": 6350 + }, + { + "epoch": 2.9, + "learning_rate": 1.080304311073542e-08, + "logits/chosen": -1.3861408233642578, + "logits/rejected": -1.3142074346542358, + "logps/chosen": -89.95799255371094, + "logps/rejected": -118.38623046875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4884513318538666, + "rewards/margins": 26.449548721313477, + "rewards/rejected": -26.937997817993164, + "step": 6360 + }, + { + "epoch": 2.91, + "learning_rate": 1.0295857988165679e-08, + "logits/chosen": -1.4367835521697998, + "logits/rejected": -1.3561252355575562, + "logps/chosen": -91.15764617919922, + "logps/rejected": -122.2125473022461, + "loss": 0.0087, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2589377164840698, + "rewards/margins": 28.840377807617188, + "rewards/rejected": -27.581439971923828, + "step": 6370 + }, + { + "epoch": 2.91, + "learning_rate": 9.788672865595943e-09, + "logits/chosen": -1.4210089445114136, + "logits/rejected": -1.3463882207870483, + "logps/chosen": -85.81571197509766, + "logps/rejected": -118.3035659790039, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9070496559143066, + "rewards/margins": 27.036890029907227, + "rewards/rejected": 
-26.129840850830078, + "step": 6380 + }, + { + "epoch": 2.92, + "learning_rate": 9.281487743026204e-09, + "logits/chosen": -1.4335598945617676, + "logits/rejected": -1.355756163597107, + "logps/chosen": -90.24996185302734, + "logps/rejected": -118.03495025634766, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1046497821807861, + "rewards/margins": 27.585540771484375, + "rewards/rejected": -26.480892181396484, + "step": 6390 + }, + { + "epoch": 2.92, + "learning_rate": 8.774302620456466e-09, + "logits/chosen": -1.3927547931671143, + "logits/rejected": -1.3146473169326782, + "logps/chosen": -85.73943328857422, + "logps/rejected": -122.47743225097656, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.633237600326538, + "rewards/margins": 30.070592880249023, + "rewards/rejected": -27.43735694885254, + "step": 6400 + }, + { + "epoch": 2.92, + "eval_logits/chosen": -1.5202581882476807, + "eval_logits/rejected": -1.428078293800354, + "eval_logps/chosen": -88.32048034667969, + "eval_logps/rejected": -119.28140258789062, + "eval_loss": 0.007134352345019579, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.5559725165367126, + "eval_rewards/margins": 27.525819778442383, + "eval_rewards/rejected": -26.969846725463867, + "eval_runtime": 69.0433, + "eval_samples_per_second": 41.452, + "eval_steps_per_second": 2.593, + "step": 6400 + }, + { + "epoch": 2.93, + "learning_rate": 8.267117497886729e-09, + "logits/chosen": -1.4425251483917236, + "logits/rejected": -1.3791589736938477, + "logps/chosen": -87.10059356689453, + "logps/rejected": -123.00959777832031, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5705853700637817, + "rewards/margins": 27.75478744506836, + "rewards/rejected": -27.1842041015625, + "step": 6410 + }, + { + "epoch": 2.93, + "learning_rate": 7.75993237531699e-09, + "logits/chosen": -1.3837153911590576, + "logits/rejected": -1.322196364402771, + "logps/chosen": 
-84.60411071777344, + "logps/rejected": -121.41426849365234, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.2655871510505676, + "rewards/margins": 26.094432830810547, + "rewards/rejected": -25.828845977783203, + "step": 6420 + }, + { + "epoch": 2.93, + "learning_rate": 7.252747252747252e-09, + "logits/chosen": -1.4268747568130493, + "logits/rejected": -1.349467158317566, + "logps/chosen": -89.53246307373047, + "logps/rejected": -123.66633605957031, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6228126883506775, + "rewards/margins": 29.124929428100586, + "rewards/rejected": -28.502117156982422, + "step": 6430 + }, + { + "epoch": 2.94, + "learning_rate": 6.745562130177514e-09, + "logits/chosen": -1.3894002437591553, + "logits/rejected": -1.328368067741394, + "logps/chosen": -85.84336853027344, + "logps/rejected": -120.75773620605469, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8587135076522827, + "rewards/margins": 26.4877986907959, + "rewards/rejected": -25.62908363342285, + "step": 6440 + }, + { + "epoch": 2.94, + "learning_rate": 6.238377007607776e-09, + "logits/chosen": -1.4132146835327148, + "logits/rejected": -1.3355062007904053, + "logps/chosen": -87.57127380371094, + "logps/rejected": -124.08438873291016, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5653873682022095, + "rewards/margins": 28.114559173583984, + "rewards/rejected": -27.549169540405273, + "step": 6450 + }, + { + "epoch": 2.95, + "learning_rate": 5.731191885038039e-09, + "logits/chosen": -1.4229376316070557, + "logits/rejected": -1.3501025438308716, + "logps/chosen": -88.9939956665039, + "logps/rejected": -123.60148620605469, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9622406959533691, + "rewards/margins": 28.20614242553711, + "rewards/rejected": -27.243900299072266, + "step": 6460 + }, + { + "epoch": 2.95, + "learning_rate": 
5.224006762468301e-09, + "logits/chosen": -1.3894935846328735, + "logits/rejected": -1.3000738620758057, + "logps/chosen": -93.95217895507812, + "logps/rejected": -121.9637451171875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0586116313934326, + "rewards/margins": 28.589069366455078, + "rewards/rejected": -27.530452728271484, + "step": 6470 + }, + { + "epoch": 2.96, + "learning_rate": 4.7168216398985626e-09, + "logits/chosen": -1.396301507949829, + "logits/rejected": -1.3185522556304932, + "logps/chosen": -89.41260528564453, + "logps/rejected": -119.04069519042969, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.45848971605300903, + "rewards/margins": 27.00503158569336, + "rewards/rejected": -26.54654312133789, + "step": 6480 + }, + { + "epoch": 2.96, + "learning_rate": 4.209636517328825e-09, + "logits/chosen": -1.3974330425262451, + "logits/rejected": -1.3252642154693604, + "logps/chosen": -85.48316955566406, + "logps/rejected": -120.0402603149414, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14276909828186035, + "rewards/margins": 27.301326751708984, + "rewards/rejected": -27.158557891845703, + "step": 6490 + }, + { + "epoch": 2.97, + "learning_rate": 3.702451394759087e-09, + "logits/chosen": -1.4150840044021606, + "logits/rejected": -1.333387017250061, + "logps/chosen": -89.85456085205078, + "logps/rejected": -121.59066009521484, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14147010445594788, + "rewards/margins": 26.220012664794922, + "rewards/rejected": -26.078542709350586, + "step": 6500 + }, + { + "epoch": 2.97, + "eval_logits/chosen": -1.524153232574463, + "eval_logits/rejected": -1.4333622455596924, + "eval_logps/chosen": -88.29774475097656, + "eval_logps/rejected": -119.24208068847656, + "eval_loss": 0.006867639254778624, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.5673406720161438, + "eval_rewards/margins": 
27.51752281188965, + "eval_rewards/rejected": -26.95018196105957, + "eval_runtime": 78.1379, + "eval_samples_per_second": 36.628, + "eval_steps_per_second": 2.291, + "step": 6500 + } + ], + "logging_steps": 10, + "max_steps": 6573, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..f768c1c --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deb527c40f0380e692d8e56f2bc7284fa72d5efdb1afeb6277dd969e90c67fb3 +size 5560 diff --git a/zero_to_fp32.py b/zero_to_fp32.py new file mode 100644 index 0000000..c98caae --- /dev/null +++ b/zero_to_fp32.py @@ -0,0 +1,587 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
from deepspeed.utils import logger
from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)


@dataclass
class zero_model_state:
    """Per-rank model state recovered from a ``*_model_states.pt`` checkpoint file.

    Fields mirror the keys DeepSpeed stores in the model-states pickle.
    (Annotations were ``dict()`` *instances* in the original; dataclasses only
    need the annotation itself, so they are plain types now.)
    """
    buffers: dict
    param_shapes: dict
    shared_params: list
    ds_version: int
    frozen_param_shapes: dict
    frozen_param_fragments: dict


debug = 0

# load to cpu
device = torch.device('cpu')


def atoi(text):
    """Return ``int(text)`` if *text* is all digits, else *text* unchanged."""
    return int(text) if text.isdigit() else text


def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    return [atoi(c) for c in re.split(r'(\d+)', text)]


def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the single rank-0 model-states file for this stage.

    Raises:
        FileNotFoundError: if the directory or the expected file is missing.
        ValueError: if *zero_stage* is not 1, 2 or 3.
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # previously fell through with `file` unbound -> UnboundLocalError
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file


def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """Return checkpoint files matching *glob_pattern*, sorted in natural (rank) order."""
    # XXX: need to test that this simple glob rule works for multi-node setup too
    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)

    if len(ckpt_files) == 0:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return ckpt_files


def get_optim_files(checkpoint_dir):
    """Return all per-rank optimizer-states files, sorted by rank."""
    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")


def get_model_state_files(checkpoint_dir):
    """Return all per-rank model-states files, sorted by rank."""
    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")


def parse_model_states(files):
    """Load each model-states file and convert it into a ``zero_model_state``."""
    zero_model_states = []
    for file in files:
        state_dict = torch.load(file, map_location=device)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # collect parameters that are included in param_shapes
        param_names = []
        for s in param_shapes:
            for name in s.keys():
                param_names.append(name)

        # update with frozen parameters
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if frozen_param_shapes is not None:
            if debug:
                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
            param_names += list(frozen_param_shapes.keys())

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states


def parse_optim_states(files, ds_checkpoint_dir):
    """Load the per-rank optimizer states and return (zero_stage, world_size, fp32_flat_groups)."""
    total_files = len(files)
    state_dicts = []
    for f in files:
        state_dict = torch.load(f, map_location=device)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    # idiomatic `not in` (was `if not ZERO_STAGE in ...`)
    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.

    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    if zero_stage <= 2:
        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor

        fp32_flat_groups = [
            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
        ]

    return zero_stage, world_size, fp32_flat_groups


def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_files = get_model_state_files(ds_checkpoint_dir)

    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # parse_optim_states has already rejected any stage other than 1/2/3
    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
    elif zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)


def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy frozen (non-trained) params straight from rank 0's fragments into *state_dict*."""
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    frozen_param_fragments = zero_model_states[0].frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Rebuild each trainable param from the per-rank flat fp32 partitions (ZeRO-1/2)."""
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(
        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            unpartitioned_numel = shape.numel()
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
        # live optimizer object, so we are checking that the numbers are within the right range
        align_to = 2 * world_size

        def zero2_align(x):
            return align_to * math.ceil(x / align_to)

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")


def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
    """Assemble the full fp32 state_dict for a ZeRO-1/2 checkpoint."""
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in zero_model_states[0].shared_params:
        if pair[1] in state_dict:
            state_dict[pair[0]] = state_dict[pair[1]]

    return state_dict


def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return (per-rank partition numel, padding numel) for a ZeRO-3 partitioned param."""
    remainder = unpartitioned_numel % world_size
    padding_numel = (world_size - remainder) if remainder else 0
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    return partitioned_numel, padding_numel


def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Stitch frozen params back together from each rank's fragment (ZeRO-3)."""
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in zero_model_states[0].frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Rebuild each trainable param by zipping the per-rank flat partitions (ZeRO-3)."""
    param_shapes = zero_model_states[0].param_shapes
    avail_numel = fp32_flat_groups[0].numel() * world_size
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
    # not asserting if there is a mismatch due to possible padding
    avail_numel = fp32_flat_groups[0].numel() * world_size
    print(f"Trainable params: Have {avail_numel} numels to process.")
    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        state_dict[name] = torch.cat(
            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
            0).narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")


def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
    """Assemble the full fp32 state_dict for a ZeRO-3 checkpoint."""
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in zero_model_states[0].shared_params:
        if pair[1] in state_dict:
            state_dict[pair[0]] = state_dict[pair[1]]

    return state_dict


def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``

    Returns:
        - pytorch ``state_dict``

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if os.path.isfile(latest_path):
            with open(latest_path, 'r') as fd:
                tag = fd.read().strip()
        else:
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)


def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
    """

    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
    print(f"Saving fp32 state dict to {output_file}")
    torch.save(state_dict, output_file)


def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    # plain strings: these log messages have no placeholders (were f-strings)
    logger.info("Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    model.load_state_dict(state_dict, strict=False)

    return model


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    parser.add_argument(
        "output_file",
        type=str,
        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)